Load libraries: dplyr, reshape2 (melt), corrplot, car (leveneTest), dunn.test, ggplot2, and cowplot

INDEX Fig1. a. (left) 394 (right) 227

Figure 1. a. (left) 394, stats 337 (right) 227 b. 459, stats 337 c. 2591, stats 2647 d. 2730 e. 2430

Figure 2. a. 679, stats 585 b. 1212, stats 1119 c. 2591, stats 2647 d. 2430 e. 2972, stats 2972 f. right 2998, stats 2998; left 3021, stats 2998

Figure S2. a. 896 b. 1372 c. 974

Figure S3. a. 1172, stats 1172 b. 1267 c. 863

Figure S4. a. 1412 b. 1474 c. 1571 d. NA e. 2050

Figure 3. a. 1912, stats 1847 b. left 2756, right 2851, stats 2851

Figure 4. a. 2709, stats 2647 b. 2730 c. 2430, stats 2549

Figure 5. a. 2647, stats 2690 b. 2506, 3183 c. 2765, 3154 d. left 3135, stats 3135; right 3106, stats 3106

Figure 6. a. left 3396, right 3396 b. 3396, stats 3396 c. 3396, stats 3396 d. 3531, 3591 e. 3481, 3522

PRS**************************************

Input the PRS data

# Read the PRS data (tab-delimited, one row per individual, PRS trait columns)
# NOTE(review): hard-coded absolute path — runs only on this machine; consider
# a relative path or a configurable data directory
prs_data <- read.table('/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/PRS_array_for_manuscript.txt', header = TRUE, sep = '\t', stringsAsFactors = FALSE)

Set color palette

# Two-level palette used throughout the plots: teal for one arrest group, grey for the other
group_colors  <- c("#3C8C78","grey") 

Define your list of traits to analyze

# List of traits to analyze
# Each name must match a PRS column in prs_data; normalized copies are created below
traits <- c("Heart_rate", "LQTS", "PR_interval", "QTc", "Afib", "HRV", "BP", "QRS", "HCM", "LVEDV","LVEDVI","LVEF","LVESV","LVESVI","SV","SVI","GGE","Focal_Epilepsy")
traits
 [1] "Heart_rate"     "LQTS"           "PR_interval"    "QTc"           
 [5] "Afib"           "HRV"            "BP"             "QRS"           
 [9] "HCM"            "LVEDV"          "LVEDVI"         "LVEF"          
[13] "LVESV"          "LVESVI"         "SV"             "SVI"           
[17] "GGE"            "Focal_Epilepsy"
# Z-score each PRS trait, storing the result as a new "Normalized_<trait>" column
prs_data <- prs_data %>%
  mutate(across(
    all_of(traits),
    ~ as.numeric(scale(.x)),
    .names = "Normalized_{.col}"
  ))

# Mapping from analysis-group label to the arrest_ontology codes it covers
group_definitions <- list(
  Control = c(1),
  Case    = c(2, 3, 4, 5, 6)
)

Format the data

# Map a single arrest-ontology code to its analysis-group label.
#
# Generalized so the group mapping can be supplied explicitly; defaults to the
# script-level `group_definitions`, so existing callers are unaffected.
#
# @param arrest_ontology_value One ontology code (scalar).
# @param definitions Named list mapping group label -> vector of codes.
# @return The first matching group name, or the code itself (as character)
#   when no group claims it — unmapped codes stay visible rather than vanish.
get_group <- function(arrest_ontology_value, definitions = group_definitions) {
  for (group_name in names(definitions)) {
    if (arrest_ontology_value %in% definitions[[group_name]]) {
      return(group_name)
    }
  }
  as.character(arrest_ontology_value)
}

# Categorize arrest_ontology into the specified groups.
# get_group() is scalar, so apply it row by row; ungroup() afterwards so the
# result is a plain data frame again — without it the frame stays
# rowwise-grouped and slows every subsequent mutate/summarise.
prs_data <- prs_data %>%
  rowwise() %>%
  mutate(arrest_group = get_group(arrest_ontology)) %>%
  ungroup()

# Filter out only the relevant groups for the plot
# (drops any rows whose ontology code fell outside both group definitions)
relevant_groups <- names(group_definitions)
filtered_data <- prs_data %>%
  filter(arrest_group %in% relevant_groups)

# Generate normalized column names based on the traits list
# (removed: the original recomputed this identical vector a second time)
normalized_traits <- paste("Normalized", traits, sep = "_")

# Keep only the normalized columns that actually exist in the data
existing_normalized_traits <- normalized_traits[normalized_traits %in% names(filtered_data)]

# Reshape the filtered data to long format for the normalized traits:
# one row per individual x trait, keyed by arrest_group
melted_data_normalized <- melt(filtered_data, 
                               id.vars = c('arrest_group'), 
                               measure.vars = existing_normalized_traits)

Split into discovery or replication

# Split the cohort into discovery and replication subsets on the Set column
prs_data_discovery <- dplyr::filter(prs_data, Set == "Discovery")

prs_data_replication <- dplyr::filter(prs_data, Set == "Replication")

Make the Corrplot

# Select only the relevant columns for correlation (discovery set only)
data_for_correlation <- prs_data_discovery[, normalized_traits]

#order the traits
# Display order: BP/ECG metrics, then LV imaging metrics, then disease PRSs
corr_traits <- c("Normalized_BP", "Normalized_Heart_rate", "Normalized_HRV", "Normalized_PR_interval", "Normalized_QTc", "Normalized_QRS",
                    "Normalized_LVEDV", "Normalized_LVEDVI", "Normalized_LVEF", "Normalized_LVESV", "Normalized_LVESVI", "Normalized_SV", "Normalized_SVI",
                    "Normalized_LQTS", "Normalized_Afib", "Normalized_HCM", "Normalized_GGE", "Normalized_Focal_Epilepsy")

# Ensure the DataFrame is ordered according to trait preferences
data_for_correlation_ordered <- data_for_correlation[, corr_traits]

# Calculate correlation matrix (pairwise-complete rows only)
cor_matrix_ordered <- cor(data_for_correlation_ordered, use = "complete.obs")

#plot correlation
# Circles sized/colored by correlation; diverging blue-to-gold palette
corrplot(cor_matrix_ordered, 
         method = "circle",
         type = "full",
         tl.col = "black",
         tl.srt = 45,
         cl.ratio = 0.2,
         col = colorRampPalette(c("#05618F", "white", "#F0BE3C"))(200),
         diag = FALSE)

Test for ANOVA

# Check ANOVA assumptions for one trait: residual normality (Shapiro-Wilk)
# and homogeneity of variance across arrest groups (Levene and Bartlett).
#
# @param data  Data frame containing the `trait` column and `arrest_group`.
# @param trait Column name (string) of the trait to test.
# @return One-row data.frame: Trait plus the three assumption-test p-values.
check_anova_assumptions <- function(data, trait) {
  # leveneTest() lives in car; load up front so the function is self-contained
  library(car)

  # Ensure 'arrest_group' is a factor
  data$arrest_group <- as.factor(data$arrest_group)

  # Fit the one-way ANOVA model
  # (renamed locals: `formula`/`residuals` shadowed base::formula and
  #  stats::residuals in the original)
  fm <- as.formula(paste(trait, "~ arrest_group"))
  anova_model <- aov(fm, data = data)
  model_resid <- residuals(anova_model)

  # Shapiro-Wilk test for normality of the residuals
  shapiro_p_value <- shapiro.test(model_resid)$p.value

  # Levene's Test for homogeneity of variances
  levene_p_value <- leveneTest(fm, data = data)$`Pr(>F)`[1]

  # Bartlett's Test for homogeneity of variances
  bartlett_p_value <- bartlett.test(fm, data = data)$p.value

  # One-row summary so results rbind cleanly across traits
  data.frame(
    Trait = trait,
    Shapiro_Wilk_p_value = shapiro_p_value,
    Levene_p_value = levene_p_value,
    Bartlett_p_value = bartlett_p_value
  )
}

# Run the assumption checks for every trait (raw trait columns, both groups)
anova_assumptions_results <- lapply(traits, function(trait) check_anova_assumptions(filtered_data, trait))
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:dplyr':

    recode
# Combine the per-trait results into a single data frame (one row per trait)
anova_assumptions_df <- do.call(rbind, anova_assumptions_results)

# Print the results
print(anova_assumptions_df)
            Trait Shapiro_Wilk_p_value Levene_p_value Bartlett_p_value
1      Heart_rate         5.271165e-05   5.681662e-01     1.285170e-01
2            LQTS         3.162300e-01   2.617585e-01     2.361534e-01
3     PR_interval         6.446775e-02   8.649432e-01     9.542401e-01
4             QTc         2.262405e-08   6.712154e-01     6.782251e-01
5            Afib         8.268008e-01   3.494958e-01     4.902742e-01
6             HRV         5.551286e-14   2.877576e-01     1.825408e-01
7              BP         1.241454e-01   3.471054e-01     2.514376e-01
8             QRS         9.731434e-43   8.101722e-06     6.553831e-21
9             HCM         1.785614e-03   7.176570e-01     8.548699e-01
10          LVEDV         4.580060e-02   1.187708e-03     5.521181e-03
11         LVEDVI         3.143288e-02   1.322513e-02     4.080442e-02
12           LVEF         4.456790e-03   1.406317e-01     2.420456e-01
13          LVESV         7.247199e-03   7.239969e-01     9.165433e-01
14         LVESVI         7.166318e-06   4.876455e-01     1.697590e-01
15             SV         6.753341e-07   6.827648e-01     8.366959e-01
16            SVI         3.751355e-04   9.185243e-01     7.460529e-01
17            GGE         2.658597e-02   7.995791e-01     6.491467e-01
18 Focal_Epilepsy         3.652661e-03   5.061986e-01     3.745069e-01

Since some do not meet ANOVA criteria, we go with nonparametric

# Run a Kruskal-Wallis test of `trait` against arrest_group and return the
# p-value as a one-row data frame (so results row-bind cleanly across traits).
perform_kruskal_wallis <- function(data, trait) {
  fm <- as.formula(paste(trait, "~ arrest_group"))
  kw <- kruskal.test(fm, data = data)

  data.frame(
    Trait = trait,
    Kruskal_Wallis_p_value = kw$p.value
  )
}


# Run Dunn's post-hoc test (Bonferroni-adjusted) for one trait and tidy the
# output into a data frame with one row per pairwise group comparison.
perform_dunn_test <- function(filtered_data, trait) {
  # Trait values and their group assignments
  scores <- filtered_data[[trait]]
  grouping <- filtered_data$arrest_group

  # dunn.test prints its own summary as a side effect
  dunn_out <- dunn.test(x = scores, g = grouping, method = "bonferroni")

  # Tidy: comparison label, Z statistic, adjusted p, and a 0.05 flag
  result <- data.frame(
    Comparison  = dunn_out$comparisons,
    Z_value     = dunn_out$Z,
    P_value     = dunn_out$P.adjusted,
    Significant = dunn_out$P.adjusted < 0.05
  )
  result$Trait <- trait

  result
}


# Perform Kruskal-Wallis test for each trait and store results (list of 1-row frames)
kruskal_results <- lapply(traits, function(trait) perform_kruskal_wallis(filtered_data, trait))

# Combine Kruskal-Wallis results into a single data frame (one row per trait)
combined_kruskal_results <- do.call(rbind, kruskal_results)

# Print Kruskal-Wallis results
print("Kruskal-Wallis Test Results:")
[1] "Kruskal-Wallis Test Results:"
# Show the full per-trait Kruskal-Wallis table
print(combined_kruskal_results)
            Trait Kruskal_Wallis_p_value
1      Heart_rate           4.382702e-03
2            LQTS           6.797534e-01
3     PR_interval           1.580135e-01
4             QTc           4.795650e-03
5            Afib           2.026287e-01
6             HRV           6.138060e-01
7              BP           1.279428e-05
8             QRS           7.181042e-08
9             HCM           6.125892e-01
10          LVEDV           3.302545e-01
11         LVEDVI           9.615777e-02
12           LVEF           5.040997e-04
13          LVESV           1.741015e-03
14         LVESVI           4.664870e-01
15             SV           5.332224e-02
16            SVI           5.284722e-04
17            GGE           3.575482e-01
18 Focal_Epilepsy           5.808578e-01
# Perform Dunn's test for each trait and store results
# (dunn.test prints a verbose block per trait as a side effect — see output below)
dunn_results <- lapply(traits, function(trait) perform_dunn_test(filtered_data, trait))
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 8.118, df = 1, p-value = 0

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |  -2.849216
         |    0.0022*

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 0.1704, df = 1, p-value = 0.68

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |  -0.412799
         |     0.3399

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 1.9931, df = 1, p-value = 0.16

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |   1.411784
         |     0.0790

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 7.9549, df = 1, p-value = 0

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |  -2.820448
         |    0.0024*

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 1.6233, df = 1, p-value = 0.2

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |  -1.274097
         |     0.1013

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 0.2547, df = 1, p-value = 0.61

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |   0.504648
         |     0.3069

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 19.041, df = 1, p-value = 0

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |   4.363594
         |    0.0000*

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 29.0153, df = 1, p-value = 0

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |   5.386581
         |    0.0000*

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 0.2564, df = 1, p-value = 0.61

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |  -0.506381
         |     0.3063

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 0.9479, df = 1, p-value = 0.33

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |   0.973601
         |     0.1651

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 2.7681, df = 1, p-value = 0.1

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |  -1.663773
         |     0.0481

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 12.1004, df = 1, p-value = 0

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |  -3.478568
         |    0.0003*

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 9.8043, df = 1, p-value = 0

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |   3.131186
         |    0.0009*

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 0.5303, df = 1, p-value = 0.47

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |   0.728206
         |     0.2332

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 3.7338, df = 1, p-value = 0.05

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |  -1.932301
         |     0.0267

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 12.0124, df = 1, p-value = 0

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |   3.465893
         |    0.0003*

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 0.8465, df = 1, p-value = 0.36

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |   0.920046
         |     0.1788

alpha = 0.05
Reject Ho if p <= alpha/2
  Kruskal-Wallis rank sum test

data: trait_values and groups
Kruskal-Wallis chi-squared = 0.3048, df = 1, p-value = 0.58

                     Comparison of trait_values by groups                      
                                 (Bonferroni)                                  
Col Mean-|
Row Mean |       Case
---------+-----------
 Control |  -0.552132
         |     0.2904

alpha = 0.05
Reject Ho if p <= alpha/2
# Combine Dunn test results into a single data frame (one row per trait-comparison pair)
combined_dunn_results <- do.call(rbind, dunn_results)

# Print Dunn Test results
print("Dunn's Test Post-Hoc Results:")
[1] "Dunn's Test Post-Hoc Results:"
# Show the full tidied Dunn's test table
print(combined_dunn_results)
       Comparison    Z_value      P_value Significant          Trait
1  Case - Control -2.8492167 2.191351e-03        TRUE     Heart_rate
2  Case - Control -0.4127996 3.398767e-01       FALSE           LQTS
3  Case - Control  1.4117842 7.900676e-02       FALSE    PR_interval
4  Case - Control -2.8204490 2.397825e-03        TRUE            QTc
5  Case - Control -1.2740978 1.013144e-01       FALSE           Afib
6  Case - Control  0.5046481 3.069030e-01       FALSE            HRV
7  Case - Control  4.3635942 6.397140e-06        TRUE             BP
8  Case - Control  5.3865814 3.590521e-08        TRUE            QRS
9  Case - Control -0.5063811 3.062946e-01       FALSE            HCM
10 Case - Control  0.9736014 1.651272e-01       FALSE          LVEDV
11 Case - Control -1.6637732 4.807888e-02        TRUE         LVEDVI
12 Case - Control -3.4785685 2.520498e-04        TRUE           LVEF
13 Case - Control  3.1311866 8.705075e-04        TRUE          LVESV
14 Case - Control  0.7282069 2.332435e-01       FALSE         LVESVI
15 Case - Control -1.9323020 2.666112e-02        TRUE             SV
16 Case - Control  3.4658937 2.642361e-04        TRUE            SVI
17 Case - Control  0.9200469 1.787741e-01       FALSE            GGE
18 Case - Control -0.5521322 2.904289e-01       FALSE Focal_Epilepsy
# Define the groups: quantitative PRS traits vs. disease/syndrome PRS traits
metrics <- c("Heart_rate", "PR_interval", "QTc", "HRV", "BP", "QRS","LVEDV", "LVEDVI","LVEF","LVESV","LVESVI","SV","SVI")
syndromes <- c("LQTS", "Afib", "HCM","GGE","Focal_Epilepsy")

# Categorize each trait (anything in neither list falls into "Combined")
combined_dunn_results$Category <- ifelse(combined_dunn_results$Trait %in% metrics, "Metrics",
                                         ifelse(combined_dunn_results$Trait %in% syndromes, "Syndromes", "Combined"))

# Reorder the levels of Trait based on the Category
# NOTE(review): this releveling is superseded by the explicit ordered_traits
# factor assigned further down
combined_dunn_results <- combined_dunn_results %>%
    mutate(Trait = factor(Trait, levels = unique(Trait[order(Category)])))

# Transform P-values to -log10 scale for plotting
combined_dunn_results$NegLogP <- -log10(combined_dunn_results$P_value)

# Calculate the -log10(0.05) for the significance reference line
threshold <- -log10(0.05)



# Create a new variable for coloring based on P-value significance
# NOTE(review): this overwrites the logical Significant column from the Dunn
# results with a character label of the same meaning
combined_dunn_results$Significant <- ifelse(combined_dunn_results$P_value < 0.05, "Significant", "Not Significant")

# Number of unique comparisons
num_comparisons <- length(unique(combined_dunn_results$Comparison))




# Combine metrics and syndromes into a single ordered list for the x-axis
# NOTE(review): "WT_global" does not appear among the traits analyzed above,
# so that level will be unused — confirm it is intentional
ordered_traits <- c("BP", "Heart_rate", "HRV", "PR_interval", "QTc","QRS",
                    "LVEDV", "LVEDVI", "LVEF", "LVESV", "LVESVI", "SV", "SVI",
                    "LQTS", "Afib", "HCM", "GGE", "Focal_Epilepsy","WT_global")

# Update the 'Trait' column to have this specific order
combined_dunn_results$Trait <- factor(combined_dunn_results$Trait,
                                      levels = ordered_traits)

# Replot with the correct order: lollipop plot of -log10(p) per trait,
# colored by the (negated) Z statistic, faceted by comparison.
# `size` for lines was deprecated in ggplot2 3.4.0 (the original run emitted
# that warning) — use `linewidth` instead.
p3 <- ggplot(combined_dunn_results, aes(x = Trait, y = NegLogP, color = -Z_value)) +
    geom_segment(aes(xend = Trait, yend = 0), linewidth = 1) +
    geom_point(size = 3) +
    # Dotted line marks the p = 0.05 threshold
    geom_hline(yintercept = threshold, linetype = "dotted", color = "Black") +
    # Diverging palette centered at Z = 0: direction of the group difference
    scale_color_gradient2(low = "black", mid = "white", high = "#3C8C78", midpoint = 0) +
    facet_wrap(~Comparison, scales = "free_x") +
    theme_cowplot() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          strip.text.x = element_text(size = 10)) +
    labs(y = "-log10(P-value)", x = "Trait", color = "Z-value", title = "Lollipop Plot of -log10(P-values) by Trait and Group Comparison")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
# Display the lollipop plot
print(p3)

# Generate a stacked pair of plots (rank density over rank CDF) for one trait.
#
# @param data         Data frame containing the trait column and arrest_group.
# @param trait        Column name (string) of the trait to plot.
# @param group_colors Vector of colors, one per arrest group.
# @return A cowplot grid: density plot (panel A) above the CDF plot (panel B).
generate_plots_for_trait <- function(data, trait, group_colors) {
  # Rank the scores for the specified trait (NA scores keep NA ranks)
  data$rank <- rank(data[[trait]], na.last = "keep")
  
  # Density plot for the trait
  # (`..density..` was deprecated in ggplot2 3.4.0 — the original run warned;
  #  use after_stat(density) instead)
  density_plot <- ggplot(data, aes(x = rank, fill = arrest_group, y = after_stat(density))) +
    geom_density(alpha = 0.6) +
    scale_fill_manual(values = group_colors) +
    labs(title = paste("Relative Density of Overall", trait, "Trait Ranks Across Arrest Groups"),
         x = paste("Overall Rank of", trait, "Scores"),
         y = "Relative Density") +
    theme_cowplot(12) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
  # CDF plot for the trait (`linewidth` replaces the deprecated `size`)
  cdf_plot <- ggplot(data, aes(x = rank, color = arrest_group)) +
    stat_ecdf(geom = "step", linewidth = 1) +
    scale_color_manual(values = group_colors) +
    labs(title = paste("CDF of Overall", trait, "Trait Ranks Across Arrest Groups"),
         x = paste("Overall Rank of", trait, "Scores"),
         y = "Cumulative Proportion") +
    theme_cowplot(12) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
  # Combine the plots vertically with A/B panel labels
  combined_plot <- plot_grid(density_plot, cdf_plot, ncol = 1, align = 'v', labels = c("A", "B"))
  
  return(combined_plot)
}


# List of traits to plot
# NOTE(review): this reassigns the global `traits` vector (previously the full
# 18-trait list) — any later code reusing `traits` will see only these five
traits <- c("BP", "Afib", "QRS","LVEF","Heart_rate")

# Initialize an empty list to store plots
plots_list <- list()

# Loop through each trait and generate plots, storing them in the list
# (plots are built on the full prs_data, not the filtered subset)
for(trait in traits) {
  plots_list[[trait]] <- generate_plots_for_trait(prs_data, trait, group_colors)
}
Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(density)` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
######## Access each plot by its trait name from the plots_list ##########
# For example, to print the plot for QRS, use:
print(plots_list[["QRS"]])

# And the heart-rate rank plots:
print(plots_list[["Heart_rate"]])

Coding******************************************************************************************************************

# Per-individual coding-variant burden statistics (tab-delimited)
# NOTE(review): hard-coded absolute path — machine-specific
coding_stats <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/statistics_by_individual_replication_for_manuscript.txt", header = TRUE, sep = "\t")

# Discovery cohort only (rows flagged Replication == 0)
coding_stats_discovery <- coding_stats[coding_stats$Replication == 0, ]

Categorize the data

# Collapse the Category code into Control (1) vs Case (2-6);
# any other code becomes NA via case_when's implicit default
categorized_coding_stats_discovery <- coding_stats_discovery %>%
  mutate(Category_Group = case_when(
    Category == 1 ~ "Control",
    Category %in% 2:6 ~ "Case"
  ))

Merge some of the column data to get the desired AF intervals

# Collapse allele-frequency bins into the intervals used downstream, for both
# the epilepsy and cardiomyopathy gene sets, in a single mutate call:
# a mid-frequency bin, a rarest bin (including singletons), and the total
# missense burden across all bins.
categorized_coding_stats_discovery <- categorized_coding_stats_discovery %>%
  mutate(
    Epilepsy_01_0001 = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
    Epilepsy_00001 = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
    Epilepsy_total_missense = Epilepsy_missense_common + Epilepsy_missense_variant_0.01 +
      Epilepsy_missense_variant_0.001 + Epilepsy_missense_variant_0.00001 +
      Epilepsy_missense_singleton,
    Cardiomyopathy_01_00001 = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
    Cardiomyopathy_00001 = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
    Cardiomyopathy_total_missense = Cardiomyopathy_missense_common + Cardiomyopathy_missense_variant_0.01 +
      Cardiomyopathy_missense_variant_0.001 + Cardiomyopathy_missense_variant_0.00001 +
      Cardiomyopathy_missense_singleton
  )

# Clean up: drop identifier columns, remove rows with NA, and remove rows
# containing infinite values in any column. filter(if_all(...)) replaces the
# superseded filter_all()/all_vars() idiom (dplyr >= 1.0); is.infinite() is
# FALSE for non-numeric columns, so the character Category_Group is unaffected.
categorized_coding_stats_discovery <- categorized_coding_stats_discovery %>%
  dplyr::select(-ID, -Category) %>%
  na.omit() %>%
  dplyr::filter(dplyr::if_all(dplyr::everything(), ~ !is.infinite(.x)))

# Aggregate the Data to compute mean and standard deviation for each numeric column
# (one row per Category_Group; across() suffixes the outputs with _mean / _std)
collapsed_coding_stats <- categorized_coding_stats_discovery %>%
  group_by(Category_Group) %>%
  summarise(across(where(is.numeric), list(mean = ~mean(., na.rm = TRUE), 
                                           std = ~sd(., na.rm = TRUE))))

# clean it up to remove inadvertent NAs 
collapsed_coding_stats <- na.omit(collapsed_coding_stats)

Perform T-test for unequal variance

# Pull variables to analyze
# (each must exist as a column in categorized_coding_stats_discovery,
# including the derived *_total_missense / *_01_* / *_00001 columns)
coding_variables_to_analyze <- c(
  "Cardiomyopathy_HIGH", "Cardiomyopathy_start_lost", "Cardiomyopathy_stop_gained",
  "Cardiomyopathy_total_missense", "Cardiomyopathy_01_00001", "Cardiomyopathy_00001",
  "PLP_Cardiomyopathy", "Epilepsy_HIGH", "Epilepsy_start_lost", "Epilepsy_stop_gained",
  "Epilepsy_total_missense", "Epilepsy_01_0001", "Epilepsy_00001",
  "PLP_Epilepsy", "variant_count"
)

ttest_results <- list()

# aov()/leveneTest()/t.test() need the grouping variable as a factor;
# convert once up front instead of reassigning it on every loop iteration
categorized_coding_stats_discovery$Category_Group <- as.factor(categorized_coding_stats_discovery$Category_Group)

# For each variable, choose a pooled-variance vs Welch t-test via Levene's test
for (var in coding_variables_to_analyze) {
  if (is.numeric(categorized_coding_stats_discovery[[var]])) {
    # Formula of the form <var> ~ Category_Group, built from the string name
    var_formula <- reformulate("Category_Group", response = var)

    # Fit the one-way ANOVA only to obtain residuals for the constancy check
    model <- aov(var_formula, data = categorized_coding_stats_discovery)
    model_residuals <- residuals(model)  # renamed: don't shadow stats::residuals

    if (length(unique(model_residuals)) > 1) {
      # Levene's test for homogeneity of variances: pooled t-test when the
      # homogeneity assumption is plausible (p > 0.05), Welch's test otherwise.
      # A single t.test call with a computed var.equal replaces the two
      # near-identical branches of the original.
      levene_test <- leveneTest(var_formula, data = categorized_coding_stats_discovery)
      levene_p_value <- levene_test$`Pr(>F)`[1]
      ttest <- t.test(var_formula, data = categorized_coding_stats_discovery,
                      var.equal = levene_p_value > 0.05)
    } else {
      # Constant residuals make Levene's test undefined; fall back to Welch
      ttest <- t.test(var_formula, data = categorized_coding_stats_discovery, var.equal = FALSE)
    }
    ttest_results[[var]] <- broom::tidy(ttest)
  } else {
    # Non-numeric column: record NA so the result list stays aligned with input
    ttest_results[[var]] <- NA
  }
}

# Print t-test results
print("t-test Results:")
[1] "t-test Results:"
print(bind_rows(ttest_results, .id = "Variable"))
# A tibble: 15 × 11
   Variable   estimate estimate1 estimate2 statistic  p.value parameter conf.low
   <chr>         <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>    <dbl>
 1 Cardiomyo…  1.41e+0   2.77e+0   1.36e+0     5.64  2.74e- 8      575.  9.17e-1
 2 Cardiomyo…  3.39e-2   3.95e-2   5.59e-3     3.25  1.22e- 3      610.  1.34e-2
 3 Cardiomyo…  3.25e-1   3.72e-1   4.66e-2     5.01  7.46e- 7      525.  1.98e-1
 4 Cardiomyo… -5.48e+0   1.06e+2   1.11e+2    -4.95  8.74e- 7     1041  -7.66e+0
 5 Cardiomyo… -1.23e+0   1.32e+1   1.44e+1    -3.16  1.65e- 3     1041  -2.00e+0
 6 Cardiomyo…  4.83e-1   1.50e+0   1.02e+0     2.52  1.19e- 2      576.  1.07e-1
 7 PLP_Cardi…  1.70e-1   2.41e-1   7.08e-2     7.23  1.11e-12      791.  1.24e-1
 8 Epilepsy_…  8.57e-1   6.26e+0   5.41e+0     2.70  7.13e- 3      633.  2.34e-1
 9 Epilepsy_…  1.00e-2   1.19e-2   1.86e-3     1.94  5.34e- 2      653. -1.45e-4
10 Epilepsy_…  3.13e-1   3.95e-1   8.19e-2     3.96  8.63e- 5      530.  1.58e-1
11 Epilepsy_… -1.42e+0   6.73e+1   6.87e+1    -2.45  1.45e- 2      962. -2.56e+0
12 Epilepsy_… -1.09e+0   8.71e+0   9.80e+0    -4.59  4.95e- 6     1041  -1.56e+0
13 Epilepsy_…  5.16e-1   1.55e+0   1.03e+0     2.12  3.43e- 2      543.  3.82e-2
14 PLP_Epile…  1.33e-2   6.92e-2   5.59e-2     0.874 3.83e- 1     1041  -1.66e-2
15 variant_c… -2.82e+3   1.14e+5   1.17e+5    -4.74  2.41e- 6     1039. -3.99e+3
# ℹ 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>

Create the Function to plot each column as a ratio to the means from Group 1

plot_column_ratio <- function(data, col_name) {
  # Summarise one numeric column by Category_Group, expressing each group's
  # mean as a ratio to the Control-group mean (Control therefore has ratio 1).
  #
  # Args:
  #   data:     data frame with a Category_Group column ("Control"/"Case")
  #   col_name: name (string) of the numeric column to summarise
  #
  # Returns a list with one element, summary_data: one row per group with
  # mean_value, sd_value, n, ratio, sem_value, sem_ratio, and a 95% CI on the
  # ratio. (The list wrapper is kept for backward compatibility with callers.)

  # [[ extraction returns a plain vector for both data.frames and tibbles;
  # the original data[rows, col_name] would return a one-column tibble for
  # tibble input, making mean() fail. The unused group1_sd was removed.
  group1_mean <- mean(data[[col_name]][data$Category_Group == "Control"], na.rm = TRUE)

  # Per-group summary, then everything scaled relative to the Control mean
  data_summary <- data %>%
    group_by(Category_Group) %>%
    summarise(mean_value = mean(.data[[col_name]], na.rm = TRUE),
              sd_value = sd(.data[[col_name]], na.rm = TRUE),
              n = n()) %>%
    mutate(ratio = mean_value / group1_mean,
           sem_value = sd_value / sqrt(n),
           sem_ratio = sem_value / group1_mean, # SEM on the same Control-relative scale as ratio
           ci_lower = ratio - (1.96 * sem_ratio), # lower bound of the 95% CI
           ci_upper = ratio + (1.96 * sem_ratio)) # upper bound of the 95% CI

  return(list(summary_data = data_summary))
}

Plot the data relative to group 1

# Build the per-column summary tables, then bind them in a single pass.
# The original seeded a data frame with placeholder columns (mean, std) that
# do not exist in the summaries, so bind_rows produced spurious all-NA
# columns; it also grew the data frame inside the loop.

# List of all columns to plot
columns_to_plot <- setdiff(names(categorized_coding_stats_discovery), c("ID", "Category", "Category_Group"))

# Summarise each column relative to the Control group, tagging each summary
# with the column it came from, then stack everything at once
summary_list <- lapply(columns_to_plot, function(col) {
  plot_data <- plot_column_ratio(categorized_coding_stats_discovery, col)
  mutate(plot_data$summary_data, col_name = col)
})
combined_coding_stats_summary_df <- bind_rows(summary_list)

Plot the data relative to group in a better way

# Subset the data for the specified variables
# (Control rows are dropped because every Control ratio is 1 by construction)
selected_coding_data <- combined_coding_stats_summary_df %>%
  filter(col_name %in% c("variant_count", 
                         "Cardiomyopathy_total_missense",
                         "Cardiomyopathy_01_00001",
                         "Cardiomyopathy_00001",
                         "Epilepsy_total_missense",
                         "Epilepsy_01_0001",
                         "Epilepsy_00001",
                         "PLP_Cardiomyopathy",
                         "PLP_Epilepsy",
                         "Cardiomyopathy_start_lost",
                         "Cardiomyopathy_stop_gained",
                         "Cardiomyopathy_HIGH",
                         "Epilepsy_start_lost",
                         "Epilepsy_HIGH",
                         "Epilepsy_stop_gained"
                         ),
         Category_Group != "Control") 


# Define the specific order for "Epilepsy" and CMAR variants
# (becomes the factor-level order, i.e. the y-axis order of the plot below)
levels_order <- c("variant_count",
                  "PLP_Epilepsy",
                  "Epilepsy_00001",
                  "Epilepsy_01_0001",
                  "Epilepsy_total_missense",
                  "Epilepsy_start_lost",
                  "Epilepsy_stop_gained",
                  "Epilepsy_HIGH",
                  "PLP_Cardiomyopathy",
                  "Cardiomyopathy_00001",
                  "Cardiomyopathy_01_00001",
                  "Cardiomyopathy_total_missense",
                  "Cardiomyopathy_start_lost",
                  "Cardiomyopathy_stop_gained",
                  "Cardiomyopathy_HIGH"
)


# Lock the plotting order of the variant classes
selected_coding_data$col_name <- factor(selected_coding_data$col_name, levels = levels_order)

# Plot: Case-vs-Control ratio per variant class, with SEM error bars.
# A single shared dodge width keeps each error bar aligned with its point —
# the original dodged points at width = 1 but error bars at width = 0.5,
# which would misalign them if more than one non-Control group were plotted.
dodge_pos <- position_dodge(width = 1)

coding_stats_plot <- ggplot(selected_coding_data, aes(y = col_name, x = ratio, color = Category_Group)) +
  geom_point(position = dodge_pos, size = 3) +
  geom_errorbar(aes(xmin = ratio - sem_ratio, xmax = ratio + sem_ratio), position = dodge_pos, width = 1) +
  geom_vline(xintercept = 1, linetype = "dashed") + # ratio of 1 = no difference from Control
  scale_color_manual(values = "#3C8C78") +
  labs(title = "Ratio Compared to Control +/-SEM",
       y = "Variant",
       x = "Ratio to Control Mean",
       color = "Category Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 8), 
        axis.text.y = element_text(size = 8, hjust = 1),
        axis.title.x = element_text(size = 8), 
        axis.title.y = element_text(size = 8),
        legend.title = element_text(size = 8),
        legend.text = element_text(size = 8),
        plot.title = element_text(size = 8, hjust = 0.5),
        plot.subtitle = element_text(size = 8),
        plot.caption = element_text(size = 8),
        plot.margin = margin(15, 15, 15, 15)) +
  scale_x_continuous(limits = c(-2, 10))


print(coding_stats_plot)

Input the data

# Pull in the gene data
gene_data <- read.csv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/individual_variants_by_gene_for_manuscript.csv")

# Read the cohort data
cohort <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SFRN_cohort_for_manuscript.txt", sep = "\t", header = TRUE)

# add the arrest category to each
# (lookup of arrest_ontology by matching gene_data$ID against cohort$CGM_id;
# IDs absent from the cohort get NA)
gene_data$Category <- cohort$arrest_ontology[match(gene_data$ID, cohort$CGM_id)]

# Keep only the discovery-set rows for the analyses below
gene_data_discovery <- gene_data %>%
  filter(Set == "Discovery")

Summarize the data by GENE, counting the number of variants for each gene. First filter for Category > 1 (cases), then group and summarize.

# Per-gene totals of HIGH-impact and PLP variants among cases (Category 2-6)
variants_per_gene_Cat_2_6 <- gene_data_discovery %>%
  filter(Category > 1) %>%
  group_by(GENE) %>%
  summarise(across(c(HIGH, PLP), ~ sum(.x, na.rm = TRUE)), .groups = "drop")

# Print the result
print(variants_per_gene_Cat_2_6)
# A tibble: 361 × 3
   GENE    HIGH   PLP
   <chr>  <int> <int>
 1 A2ML1     57     0
 2 AARS      12     0
 3 ABAT       0     0
 4 ABCC9      4     0
 5 ACADVL     5     2
 6 ACTC1      1     0
 7 ACTL6B     0     0
 8 ACTN2     15     0
 9 ADAM22     3     0
10 ADSL       1     1
# ℹ 351 more rows

Filter for Category == 1 and then group and summarize

# Per-gene totals of HIGH-impact and PLP variants among controls (Category 1)
variants_per_gene_Cat_1 <- gene_data_discovery %>%
  filter(Category == 1) %>%
  group_by(GENE) %>%
  summarise(across(c(HIGH, PLP), ~ sum(.x, na.rm = TRUE)), .groups = "drop")

# Print the result
print(variants_per_gene_Cat_1)
# A tibble: 361 × 3
   GENE    HIGH   PLP
   <chr>  <int> <int>
 1 A2ML1     69     0
 2 AARS       0     0
 3 ABAT       0     0
 4 ABCC9      3     0
 5 ACADVL     8     3
 6 ACTC1      0     0
 7 ACTL6B     0     2
 8 ACTN2      1     0
 9 ADAM22     0     0
10 ADSL       0     0
# ℹ 351 more rows

Load gene lists from text files

genes_CMAR1 <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_CMAR1_list.txt")
genes_CMAR2 <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_CMAR2_list.txt")
genes_EIEE_OMIM <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_EIEE_OMIM_list.txt")
genes_Epilepsy <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_Epilepsy_list.txt")

Annotate gene with source panel

# Tag each panel's gene list with its source disease panel
genes_CMAR1 <- data.frame(Gene = genes_CMAR1, Source = "Cardiomyopathy")
genes_CMAR2 <- data.frame(Gene = genes_CMAR2, Source = "Cardiomyopathy")
genes_EIEE_OMIM <- data.frame(Gene = genes_EIEE_OMIM, Source = "Epilepsy")
genes_Epilepsy <- data.frame(Gene = genes_Epilepsy, Source = "Epilepsy")

# Replace the quoted entry "AARS1" with AARS (the symbol used in gene_data)
genes_EIEE_OMIM$Gene <- ifelse(genes_EIEE_OMIM$Gene == "\"AARS1\"", "AARS", genes_EIEE_OMIM$Gene)

# Replace the quoted entry "NEXMIF" with KIAA2022 (the symbol used in gene_data)
# NOTE(review): the original comment said the reverse of what the code does;
# the code replaces NEXMIF -> KIAA2022
genes_Epilepsy$Gene <- ifelse(genes_Epilepsy$Gene == "\"NEXMIF\"", "KIAA2022", genes_Epilepsy$Gene)

Append panel source to the gene_data

# Combine all lists into one dataframe
all_genes <- rbind(genes_CMAR1, genes_CMAR2, genes_EIEE_OMIM, genes_Epilepsy)

# Sort by Gene and Source to ensure "Cardiomyopathy" comes first
# (works because "Cardiomyopathy" sorts alphabetically before "Epilepsy")
all_genes <- all_genes[order(all_genes$Gene, all_genes$Source), ]

# Remove duplicates, keeping the first occurrence
# (so a gene present on both panels is labeled Cardiomyopathy)
all_genes <- all_genes[!duplicated(all_genes$Gene), ]

# Clean the 'Gene' column in 'all_genes' by removing quotes and backslashes
all_genes$Gene <- gsub('\"', '', all_genes$Gene)

# Ensure that the 'GENE' column in 'gene_data_discovery' and 'Gene' in 'all_genes' are character type
gene_data_discovery$GENE <- as.character(gene_data_discovery$GENE)
all_genes$Gene <- as.character(all_genes$Gene)

# find the index of each gene in 'all_genes' and use this index to assign the corresponding 'Source' to a new column in 'gene_data'
# (genes without a panel match get NA)
gene_data_discovery$Source <- all_genes$Source[match(gene_data_discovery$GENE, all_genes$Gene)]

# Now, 'gene_data' will have a new column 'Source' with the source of each gene
# based on the lookup from 'all_genes'

# View the first few rows to verify the new 'Source' has been added
head(gene_data_discovery)
          ID   GENE HIGH PLP       Set Category         Source
1 CGM0000001  A2ML1    0   0 Discovery        1 Cardiomyopathy
2 CGM0000001   AARS    0   0 Discovery        1       Epilepsy
3 CGM0000001   ABAT    0   0 Discovery        1       Epilepsy
4 CGM0000001  ABCC9    0   0 Discovery        1 Cardiomyopathy
5 CGM0000001 ACADVL    0   0 Discovery        1 Cardiomyopathy
6 CGM0000001  ACTC1    0   0 Discovery        1 Cardiomyopathy

Add the source to the category 1 variants list

# Annotate each control-set gene summary with its source panel (NA if unmatched)
variants_per_gene_Cat_1$Source <- all_genes$Source[match(variants_per_gene_Cat_1$GENE, all_genes$Gene)]

head(variants_per_gene_Cat_1)
# A tibble: 6 × 4
  GENE    HIGH   PLP Source        
  <chr>  <int> <int> <chr>         
1 A2ML1     69     0 Cardiomyopathy
2 AARS       0     0 Epilepsy      
3 ABAT       0     0 Epilepsy      
4 ABCC9      3     0 Cardiomyopathy
5 ACADVL     8     3 Cardiomyopathy
6 ACTC1      0     0 Cardiomyopathy

Add the source to the category 2-6 variants list

# Annotate each case-set gene summary with its source panel (NA if unmatched)
variants_per_gene_Cat_2_6$Source <- all_genes$Source[match(variants_per_gene_Cat_2_6$GENE, all_genes$Gene)]

head(variants_per_gene_Cat_2_6)
# A tibble: 6 × 4
  GENE    HIGH   PLP Source        
  <chr>  <int> <int> <chr>         
1 A2ML1     57     0 Cardiomyopathy
2 AARS      12     0 Epilepsy      
3 ABAT       0     0 Epilepsy      
4 ABCC9      4     0 Cardiomyopathy
5 ACADVL     5     2 Cardiomyopathy
6 ACTC1      1     0 Cardiomyopathy

Combine the case and control variant summaries into one table

# Tag each per-gene summary with its arrest-ontology label before stacking
variants_per_gene_Cat_1$Category <- "1"      # controls
variants_per_gene_Cat_2_6$Category <- "2-6"  # cases

# Stack the two summaries into one long table for joint plotting
combined_variants <- bind_rows(variants_per_gene_Cat_1, variants_per_gene_Cat_2_6)

# Print the combined dataset
print(combined_variants)
# A tibble: 722 × 5
   GENE    HIGH   PLP Source         Category
   <chr>  <int> <int> <chr>          <chr>   
 1 A2ML1     69     0 Cardiomyopathy 1       
 2 AARS       0     0 Epilepsy       1       
 3 ABAT       0     0 Epilepsy       1       
 4 ABCC9      3     0 Cardiomyopathy 1       
 5 ACADVL     8     3 Cardiomyopathy 1       
 6 ACTC1      0     0 Cardiomyopathy 1       
 7 ACTL6B     0     2 Epilepsy       1       
 8 ACTN2      1     0 Cardiomyopathy 1       
 9 ADAM22     0     0 Epilepsy       1       
10 ADSL       0     0 Epilepsy       1       
# ℹ 712 more rows
# Filter dataset where High > 0
combined_variants_High <- combined_variants %>% filter(HIGH > 0)

# Create a label function: maps Category codes to display labels for plotting
custom_labeller <- function(variable, value) {
  # Pass anything other than the Category variable through unchanged
  if (variable != "Category") {
    return(value)
  }
  # "1" -> "Control"; every other code -> "Case" (vectorized over value)
  ifelse(value == "1", "Control", "Case")
}

# Create the plot with custom labeller
# Heatmap: genes (x) vs category (y), tile fill = HIGH-impact variant count,
# faceted by source panel (Cardiomyopathy / Epilepsy)
High_variant_plot <- ggplot(combined_variants_High, aes(x = GENE, y = Category, fill = HIGH)) +
  geom_tile() + 
  scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"), 
                       values = scales::rescale(c(0, 0.5, 1))) + 
  facet_wrap(~ Source, scales = "free_x", ncol = 1) + 
  labs(title = "High Variants Heatmap",
       x = "Gene",
       y = "Category",
       fill = "Count") +
  theme_cowplot(12) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        strip.background = element_blank(), 
        strip.placement = "outside") +
  # Relabel the y-axis categories ("1" -> Control, "2-6" -> Case)
  scale_y_discrete(labels = function(x) custom_labeller("Category", x))

# Print the plot
print(High_variant_plot)

# Create a custom labeller function for PLP_plot (same mapping as the
# HIGH-variant labeller: Category "1" -> "Control", otherwise "Case")
custom_labeller_plp <- function(variable, value) {
  # Non-Category variables pass through unchanged
  if (variable != "Category") {
    return(value)
  }
  ifelse(value == "1", "Control", "Case")
}

# Filter datasets where PLP > 0
combined_variants_PLP <- combined_variants %>% filter(PLP > 0)

# Create the PLP plot with custom labeller
# Same heatmap layout as High_variant_plot, but tiles filled by PLP counts
PLP_plot <- ggplot(combined_variants_PLP, aes(x = GENE, y = Category, fill = PLP)) +
  geom_tile() + 
  scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"), 
                       values = scales::rescale(c(0, 0.5, 1))) + 
  facet_wrap(~ Source, scales = "free_x", ncol = 1) + 
  labs(title = "PLP Heatmap",
       x = "Gene",
       y = "Category",
       fill = "Count") +
  theme_cowplot(12) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        strip.background = element_blank(), 
        strip.placement = "outside") +
  # Relabel the y-axis categories ("1" -> Control, "2-6" -> Case)
  scale_y_discrete(labels = function(x) custom_labeller_plp("Category", x))

# Print the plot
print(PLP_plot)

Now bring in the module data

# Expert-curated cardiac gene modules (Gene -> Module mapping) from Excel
modules <- read_excel("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/cardio_gene_sets.xlsx")

Annotate the gene_data with the modules

# Left join keeps every gene_data_discovery row; genes not in any module get NA
module_data <- left_join(gene_data_discovery, modules, by = c("GENE" = "Gene"))

# View the first few rows of the updated data
head(module_data)
          ID   GENE HIGH PLP       Set Category         Source
1 CGM0000001  A2ML1    0   0 Discovery        1 Cardiomyopathy
2 CGM0000001   AARS    0   0 Discovery        1       Epilepsy
3 CGM0000001   ABAT    0   0 Discovery        1       Epilepsy
4 CGM0000001  ABCC9    0   0 Discovery        1 Cardiomyopathy
5 CGM0000001 ACADVL    0   0 Discovery        1 Cardiomyopathy
6 CGM0000001  ACTC1    0   0 Discovery        1 Cardiomyopathy
             Module
1         Rasopathy
2              <NA>
3              <NA>
4 Membrane_polarity
5      mitochondria
6         sarcomere

NOW PLOT THE PLPs by their expert-defined categories. The size is the absolute number of PLP variants in the category; the color is the number of variants relative to the module's gene count (variants per gene).

#Filter NAs and aggregate the data
module_data <- module_data %>%
  filter(!is.na(Module))

# Label controls vs cases for the module-level summaries
module_data <- module_data %>%
  mutate(Category_Group = ifelse(Category == 1, "Category 1", "Categories 2-6"))

#count up the variants
# .groups = "drop" returns an ungrouped tibble directly, replacing the
# trailing ungroup() and silencing the summarise() regrouping message
module_summary <- module_data %>%
  group_by(Module, Category_Group) %>%
  summarise(Total_PLP_variant = sum(PLP, na.rm = TRUE), .groups = "drop")
`summarise()` has grouped output by 'Module'. You can override using the
`.groups` argument.
# First, calculate the number of genes per module
genes_per_module <- modules %>%
  group_by(Module) %>%
  summarise(Num_Genes = n()) %>%
  ungroup()

# Merge this information with your module_summary data frame
module_summary <- module_summary %>%
  left_join(genes_per_module, by = c("Module" = "Module"))

# Calculate the size relative to the number of genes per module
# (PLP variants per gene, so large modules are not over-weighted)
module_summary <- module_summary %>%
  mutate(Relative_Size = Total_PLP_variant / Num_Genes)

# Filter the data to only include "Categories 2-6"
module_summary_filtered <- module_summary %>%
  filter(Category_Group == "Categories 2-6")

# Order modules by descending total PLP count for the plot's x axis
module_order <- module_summary_filtered %>%
  group_by(Module) %>%
  summarise(Sort_Metric = mean(Total_PLP_variant, na.rm = TRUE)) %>%
  arrange(desc(Sort_Metric)) %>%
  .$Module

module_summary_filtered$Module <- factor(module_summary_filtered$Module, levels = module_order)

# Now plot with the reordered Module
# Point size encodes the absolute PLP count per module; color encodes the
# count scaled by module gene count (Relative_Size)
modules_plot <- ggplot(module_summary_filtered, aes(x = Module, y = Category_Group, size = Total_PLP_variant, color = Relative_Size)) +
  geom_point(shape = 15, alpha = 1) +
  scale_size(range = c(3, 20), name = "Number of PLPs") + 
  scale_color_gradient(low = "Grey", high = "#05618F", name = "# of PLPs relative to Module size") +
  theme_minimal() +
  labs(title = "Total PLP by Module (Cases)",
       x = "Module",
       y = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(modules_plot)

#Count the High effect variants
# One row per individual x module: total HIGH-impact variants in that module
aggregated_data <- module_data %>%
  group_by(ID, Module, Category) %>%
  summarise(High_count = sum(HIGH),
            .groups = 'drop')

#split off controls
data_cat_1 <- aggregated_data %>%
  filter(Category %in% c(1))

#identify IDs with any 'High_count' greater than 0
ids_with_both_1 <- data_cat_1 %>%
  group_by(ID) %>%
  summarise(High_count_any = any(High_count > 0)) %>%
  filter(High_count_any) %>%
  dplyr::select(ID)

# Filter to include only rows that have IDs with High_count > 0
modules_with_both_1 <- data_cat_1 %>%
  semi_join(ids_with_both_1, by = "ID")

#find all possible pairs of modules where 'High_count' is greater than 0
# and generate a new df with each row representing a unique pair of modules for each ID.
# combn() enumerates every unordered module pair per ID; the two left_joins
# attach each member's High_count; the final filter keeps pairs where at
# least ONE member has a high-impact variant (despite the "_both_" naming)
module_pairs_for_ids_1 <- modules_with_both_1 %>%
  group_by(ID) %>%
  do({
    data.frame(t(combn(unique(.$Module), 2)), stringsAsFactors = FALSE)
  }) %>%
  dplyr::rename(Module1 = X1, Module2 = X2) %>%
  left_join(modules_with_both_1, by = c("ID", "Module1" = "Module")) %>%
  left_join(modules_with_both_1, by = c("ID", "Module2" = "Module"), suffix = c("_1", "_2")) %>%
  filter(High_count_2 > 0 | High_count_1 > 0)

# Count occurrences of module pairs across IDs
# (Count = number of distinct individuals carrying that pair)
module_pair_counts_1 <- module_pairs_for_ids_1 %>%
  group_by(Module1, Module2) %>%
  summarise(Count = n_distinct(ID), .groups = 'drop')



#split off cases
data_cat_2_6 <- aggregated_data %>%
  filter(Category %in% c(2,3,4,5,6))

#identify IDs with any 'High_count' greater than 0
# NOTE(review): the helper column is named Missense_any here but counts
# HIGH-impact variants (the Category-1 version calls it High_count_any) —
# the logic is the same, only the name is inconsistent
ids_with_both_2_6 <- data_cat_2_6 %>%
  group_by(ID) %>%
  summarise(Missense_any = any(High_count > 0)) %>%
  filter(Missense_any) %>%
  dplyr::select(ID)

# Filter to include only rows that have IDs with High_count > 0
modules_with_both_2_6 <- data_cat_2_6 %>%
  semi_join(ids_with_both_2_6, by = "ID")


#find all possible pairs of modules where 'High_count' is greater than 0
# and generate a new df with each row representing a unique pair of modules for each ID.
# (combn enumerates unordered module pairs per ID; the filter keeps pairs
# where at least one member has a high-impact variant)
module_pairs_for_ids_2_6 <- modules_with_both_2_6 %>%
  group_by(ID) %>%
  do({
    data.frame(t(combn(unique(.$Module), 2)), stringsAsFactors = FALSE)
  }) %>%
  dplyr::rename(Module1 = X1, Module2 = X2) %>%
  left_join(modules_with_both_2_6, by = c("ID", "Module1" = "Module")) %>%
left_join(modules_with_both_2_6, by = c("ID", "Module2" = "Module"), suffix = c("_1", "_2")) %>%
  filter(High_count_2 > 0 | High_count_1 > 0)

# Count occurrences of module pairs across IDs
module_pair_counts_2_6 <- module_pairs_for_ids_2_6 %>%
  group_by(Module1, Module2) %>%
  summarise(Count = n_distinct(ID), .groups = 'drop')


# Ensure all combinations are present with at least a zero count for each category
comparison_df <- merge(module_pair_counts_1, module_pair_counts_2_6, by = c("Module1", "Module2"), all = TRUE)
comparison_df[is.na(comparison_df)] <- 0  # Replace NA with 0 (affects all columns; only the Count columns can be NA here)

# Rename columns
names(comparison_df)[names(comparison_df) == "Count.x"] <- "Count_1"
names(comparison_df)[names(comparison_df) == "Count.y"] <- "Count_2_6"

# Add number of people per group for totals not in counts
# NOTE(review): denominators come from the FULL cohort table, while the pair
# counts were built from discovery-set individuals only — confirm intended
comparison_df$Not_Count_1 <- sum(cohort$arrest_ontology == "1") - comparison_df$Count_1
comparison_df$Not_Count_2_6 <- sum(cohort$arrest_ontology %in% c(2,3,4,5,6)) - comparison_df$Count_2_6

# Fisher's exact test on a single 2x2 table.
# Rows: individuals carrying / not carrying the module pair.
# Columns: controls (Group_1) vs cases (Group_2_6).
# Returns the two-sided p-value.
perform_fisher_test <- function(count_1, not_count_1, count_2_6, not_count_2_6) {
  tbl <- matrix(
    c(count_1, not_count_1, count_2_6, not_count_2_6),
    nrow = 2,
    dimnames = list(c("In Count", "Not in Count"), c("Group_1", "Group_2_6"))
  )
  fisher.test(tbl)$p.value
}

# Apply the Fisher function to each row in the dataframe
# (mapply iterates the four count columns in parallel, one test per pair)
comparison_df$p_value <- mapply(perform_fisher_test, 
                                comparison_df$Count_1, 
                                comparison_df$Not_Count_1, 
                                comparison_df$Count_2_6, 
                                comparison_df$Not_Count_2_6)


# Bonferroni correction across all tested pairs
# (the explicit n argument equals p.adjust's default; redundant but harmless)
comparison_df$adjusted_p_value <- p.adjust(comparison_df$p_value, method = "bonferroni", n = length(comparison_df$p_value))

# Filter significant results based on adjusted p-values
significant_pairs <- comparison_df %>% filter(adjusted_p_value < 0.05)

# Print significant module pairs
print(significant_pairs)
              Module1            Module2 Count_1 Count_2_6 Not_Count_1
1                 DGC fate_specification       5        31         591
2                 DGC                ICD      28        61         568
3                 DGC  Membrane_polarity      14        44         582
4                 DGC   nuclear_envelope       2        18         594
5                 DGC       Proteostasis       4        25         592
6                 DGC          sarcomere     164       216         432
7                 DGC             Z_disc      11        35         585
8  fate_specification  lysosomal_storage       6        27         590
9  fate_specification   nuclear_envelope       3        26         593
10 fate_specification          sarcomere     164       226         432
11                ICD fate_specification      29        67         567
12                ICD  Membrane_polarity      38        78         558
13                ICD       Proteostasis      28        63         568
14                ICD          sarcomere     180       239         416
15                ICD             Z_disc      35        72         561
16  lysosomal_storage          sarcomere     165       215         431
17  Membrane_polarity fate_specification      15        50         581
18  Membrane_polarity  lysosomal_storage      15        40         581
19  Membrane_polarity   nuclear_envelope      12        37         584
20  Membrane_polarity       Proteostasis      14        45         582
21  Membrane_polarity          sarcomere     171       229         425
22  Membrane_polarity             Z_disc      21        54         575
23       mitochondria          sarcomere     173       222         423
24   nuclear_envelope          sarcomere     162       214         434
25       Proteostasis fate_specification       5        34         591
26       Proteostasis   nuclear_envelope       2        17         594
27       Proteostasis          sarcomere     162       220         434
28       Proteostasis             Z_disc      11        36         585
29             Z_disc fate_specification      12        44         584
30             Z_disc          sarcomere     166       220         430
   Not_Count_2_6      p_value adjusted_p_value
1            542 4.031980e-06     3.669102e-04
2            512 1.510745e-04     1.374778e-02
3            529 2.256059e-05     2.053014e-03
4            555 1.554685e-04     1.414764e-02
5            548 3.749909e-05     3.412417e-03
6            357 2.261894e-04     2.058324e-02
7            538 2.235224e-04     2.034054e-02
8            546 1.283913e-04     1.168361e-02
9            547 4.819768e-06     4.385989e-04
10           347 1.803934e-05     1.641580e-03
11           506 2.585037e-05     2.352384e-03
12           495 3.524117e-05     3.206946e-03
13           510 7.019329e-05     6.387589e-03
14           334 4.336333e-05     3.946063e-03
15           501 7.010650e-05     6.379692e-03
16           358 3.674011e-04     3.343350e-02
17           523 2.792575e-06     2.541243e-04
18           533 2.937979e-04     2.673561e-02
19           536 1.939491e-04     1.764936e-02
20           528 1.395707e-05     1.270093e-03
21           344 5.971621e-05     5.434175e-03
22           519 3.894246e-05     3.543764e-03
23           351 5.261606e-04     4.788062e-02
24           359 2.177828e-04     1.981824e-02
25           539 6.695766e-07     6.093147e-05
26           556 2.916050e-04     2.653605e-02
27           353 4.930944e-05     4.487159e-03
28           537 1.392532e-04     1.267204e-02
29           529 6.591986e-06     5.998707e-04
30           353 1.455038e-04     1.324084e-02
#merge the high effect count data 
High_data <- rbind(data_cat_1, data_cat_2_6)

#assign the case/control categories ("1" = control, anything else = "2-6" case)
High_data <- High_data %>%
  mutate(Category_group = case_when(
    Category == 1 ~ "1",
    TRUE ~ "2-6"
  ))

#assign in the module gene count data
# NOTE(review): gene counts per module are hard-coded here; confirm they
# match the current cardio_gene_sets.xlsx module membership
df_gene_counts <- data.frame(
  Module = c("CICR", "DGC", "ICD", "Membrane_polarity", "Proteostasis", 
             "Rasopathy", "SNS_PNS", "Z_disc", "cytokine", 
             "fate_specification", "lysosomal_storage", "mitochondria", 
             "nuclear_envelope", "sarcomere"),
  Num_Genes = c(11, 7, 12, 24, 7, 17, 10, 12, 4, 11, 3, 25, 4, 16)
)

# scale the high effect date per module based on the genes in the module
High_data_scaled <- High_data %>%
  left_join(df_gene_counts, by = "Module")

# HIGH-impact variants per gene in the module (normalizes for module size)
High_data_scaled <- High_data_scaled %>%
  mutate(High_count_per_gene = High_count / Num_Genes)

#compute the means
# Per-module, per-group mean / SD / SEM of the gene-count-scaled HIGH counts
averages_sem_scaled <- High_data_scaled %>%
  group_by(Module, Category_group) %>%
  summarize(
    Mean = mean(High_count_per_gene),
    SD = sd(High_count_per_gene),
    N = n(),
    SEM = SD / sqrt(N),
    .groups = 'drop'
  )

#run the t-tests to compare the modules per case or control (Category_group)
# (t.test's formula default is Welch's unequal-variance test; do() is
# superseded in current dplyr but retained here as written)
test_results <- High_data_scaled %>%
  group_by(Module) %>%
  do({
    ttest <- t.test(High_count_per_gene ~ Category_group, data = .)
    tidy(ttest)
  }) %>%
  ungroup()  

# show t-test data
print(test_results)
# A tibble: 14 × 11
   Module      estimate estimate1 estimate2 statistic p.value parameter conf.low
   <chr>          <dbl>     <dbl>     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>
 1 CICR        -0.0145   0.0193     0.0338     -2.28  2.31e-2      920. -0.0270 
 2 DGC         -0.00427  0.000532   0.00480    -3.07  2.24e-3      584. -0.00700
 3 ICD         -0.00785  0.00714    0.0150     -2.14  3.29e-2      865. -0.0151 
 4 Membrane_p… -0.00698  0.00101    0.00799    -3.90  1.09e-4      534. -0.0105 
 5 Proteostas… -0.00511  0.000532   0.00565    -3.06  2.32e-3      559. -0.00840
 6 Rasopathy   -0.00212  0.00800    0.0101     -1.51  1.32e-1      996. -0.00487
 7 SNS_PNS     -0.00561  0.00447    0.0101     -1.90  5.75e-2      616. -0.0114 
 8 Z_disc      -0.00519  0.00140    0.00659    -3.24  1.28e-3      596. -0.00834
 9 cytokine     0.0141   0.100      0.0860      1.73  8.40e-2     1005. -0.00190
10 fate_speci… -0.00848  0.000508   0.00898    -3.39  7.63e-4      519. -0.0134 
11 lysosomal_… -0.00143  0.00186    0.00329    -0.788 4.31e-1      937. -0.00500
12 mitochondr… -0.00157  0.00112    0.00269    -2.11  3.52e-2      701. -0.00303
13 nuclear_en… -0.00296  0          0.00296    -1.90  5.77e-2      505  -0.00603
14 sarcomere   -0.0414   0.0244     0.0658     -6.10  1.89e-9      582. -0.0547 
# ℹ 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
#calculate averages
# NOTE(review): this recomputes exactly the same table already built above
# (same inputs, same summarize call) — redundant but harmless
averages_sem_scaled <- High_data_scaled %>%
  group_by(Module, Category_group) %>%
  summarize(
    Mean = mean(High_count_per_gene),
    SD = sd(High_count_per_gene),
    N = n(),
    SEM = SD / sqrt(N),
    .groups = 'drop'
  )

# Legend labels for the two cohort groups
custom_fill_labels <- c("1" = "Control", "2-6" = "Case")

# Dodged bar chart of mean normalised HIGH counts per module, +/- SEM.
# A single dodge object keeps bars and error bars aligned (width 0.8).
dodge <- position_dodge(width = 0.8)
High_modules_plot_scaled <- ggplot(
  averages_sem_scaled,
  aes(x = Module, y = Mean, fill = Category_group, group = Category_group)
) +
  geom_col(position = dodge, color = "black") +
  geom_errorbar(aes(ymin = Mean - SEM, ymax = Mean + SEM), width = .2, position = dodge) +
  scale_fill_manual(values = c("1" = "#FF6464", "2-6" = "#05618F"), labels = custom_fill_labels) +
  labs(x = "Module", y = "Average High Count Per Gene") +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Show plot
print(High_modules_plot_scaled)

#compute the relative counts based on cohort size
# Ratio of (case count / case cohort size) to (control count / control size).
# NOTE(review): the case test uses numeric c(2,3,4,5,6) but the control test
# uses the string "1" — works via coercion, but should be made consistent.
comparison_df <- comparison_df %>%
  mutate(Relative_Counts = (Count_2_6 / (sum(cohort$arrest_ontology %in% c(2,3,4,5,6))))/(Count_1 / (sum(cohort$arrest_ontology == "1"))))

#assign in the module gene count data
# Hand-curated number of genes per module (used to size network nodes)
df_gene_counts <- data.frame(
  Module = c("CICR", "DGC", "ICD", "Membrane_polarity", "Proteostasis", 
             "Rasopathy", "SNS_PNS", "Z_disc", "cytokine", 
             "fate_specification", "lysosomal_storage", "mitochondria", 
             "nuclear_envelope", "sarcomere"),
  Num_Genes = c(11, 7, 12, 24, 7, 17, 10, 12, 4, 11, 3, 25, 4, 16)
)
# Keep only module pairs whose adjusted p-value is significant
significant_edges_df <- comparison_df %>%
  filter(adjusted_p_value < 0.05)

# Create an igraph object. Including adjusted_p_value and Relative_Counts in
# `d` attaches them as edge attributes at construction time — safer than
# assigning E(...)$attr afterwards, which silently relies on igraph keeping
# the same edge ordering as the data frame rows.
network_graph <- graph_from_data_frame(
  d = significant_edges_df[, c("Module1", "Module2", "adjusted_p_value", "Relative_Counts")],
  directed = FALSE
)

# Matching Num_Genes from df_gene_counts to vertices in the network_graph
# (NA for any vertex without a curated gene count)
V(network_graph)$Num_Genes <- df_gene_counts$Num_Genes[match(V(network_graph)$name, df_gene_counts$Module)]


# Plot the graph with ggraph
HIGH_network <- ggraph(network_graph, layout = 'fr') +
  geom_edge_link(aes(color = -log10(adjusted_p_value), edge_width = Relative_Counts), alpha = 0.8) +
  geom_node_point(aes(size = Num_Genes), color = "Black") +
  geom_node_text(aes(label = name), repel = TRUE, point.padding = unit(0.01, "lines")) +
  scale_size_continuous(range = c(3, 10)) +
  theme_graph()

print(HIGH_network)
Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
ℹ Please use the `transform` argument instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.

#Filter NAs and aggregate the data
module_data <- module_data %>%
  filter(!is.na(Module))

# Two-level grouping: controls (Category 1) vs cases (Categories 2-6)
module_data <- module_data %>%
  mutate(Category_Group = ifelse(Category == 1, "Category 1", "Categories 2-6"))

#count up the variants: total HIGH-impact variants per module and group
module_summary <- module_data %>%
  group_by(Module, Category_Group) %>%
  summarise(Total_HIGH_variant = sum(HIGH, na.rm = TRUE)) %>%
  ungroup() 
`summarise()` has grouped output by 'Module'. You can override using the
`.groups` argument.
# Gene count per module, used to scale variant totals by module size
genes_per_module <- modules %>%
  group_by(Module) %>%
  summarise(Num_Genes = n(), .groups = "drop")

# Attach Num_Genes to each row of the module summary
module_summary <- module_summary %>%
  left_join(genes_per_module, by = "Module")

# Calculate the size relative to the number of genes per module
module_summary <- module_summary %>%
  mutate(Relative_Size = Total_HIGH_variant / Num_Genes)

# Filter the data to only include "Categories 2-6" (cases only)
module_summary_filtered <- module_summary %>%
  filter(Category_Group == "Categories 2-6")

#order them by mean count (descending) so the plot is sorted left-to-right
module_order <- module_summary_filtered %>%
  group_by(Module) %>%
  summarise(Sort_Metric = mean(Total_HIGH_variant, na.rm = TRUE)) %>%
  arrange(desc(Sort_Metric)) %>%
  .$Module

# Re-level the factor so ggplot honours the sorted order
module_summary_filtered$Module <- factor(module_summary_filtered$Module, levels = module_order)

# Now plot with the reordered Module: square points sized by absolute HIGH
# count, coloured by the count normalised to module size
modules_HIGH_plot <- ggplot(module_summary_filtered, aes(x = Module, y = Category_Group, size = Total_HIGH_variant, color = Relative_Size)) +
  geom_point(shape = 15, alpha = 1) +
  scale_size(range = c(3, 20), name = "Number of HIGHs") + 
  scale_color_gradient(low = "Grey", high = "#05618F", name = "# of HIGHs relative to Module size") +
  theme_minimal() +
  labs(title = "Total HIGH by Module (Cases)",
       x = "Module",
       y = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(modules_HIGH_plot)

Pull in uniprot data and plot PLP variants

# pull down data for MYH7 (uniprot ID P12883) and flatten features to a df
MYH7_data <- drawProteins::get_features("P12883")
[1] "Download has worked"
MYH7_data <- drawProteins::feature_to_dataframe(MYH7_data)

# pull down data for MYBPC3 (uniprot ID Q14896)
MYBPC3_data <- drawProteins::get_features("Q14896")
[1] "Download has worked"
MYBPC3_data <- drawProteins::feature_to_dataframe(MYBPC3_data)

# pull down data for SCN5A (uniprot ID Q14524)
SCN5A_data <- drawProteins::get_features("Q14524")
[1] "Download has worked"
SCN5A_data <- drawProteins::feature_to_dataframe(SCN5A_data)
# Build the MYH7 protein schematic layer by layer (chains, domains, regions)
MYH7_plot <- draw_canvas(MYH7_data)
MYH7_plot <- draw_chains(MYH7_plot, MYH7_data)
MYH7_plot <- draw_domains(MYH7_plot, MYH7_data)
MYH7_plot <- draw_regions(MYH7_plot, MYH7_data)
#MYH7_plot <- draw_phospho(MYH7_plot, MYH7_data)

# Residue positions of the cohort's variants to overlay on the schematic
# (presumably P/LP variants — confirm against the variant table)
variants <- data.frame(
  begin = c(1712,1057,908,904,869,741,723,576,369,355),
  y = rep(1, 10)  
)


# Now, add these points to your plot
MYH7_plot <- MYH7_plot + 
  geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)

# Adjust themes and aesthetics as needed (strip grids, ticks, y labels)
MYH7_plot <- MYH7_plot + theme_bw(base_size = 14) + 
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(), 
        axis.text.y = element_blank(),
        panel.border = element_blank())

MYH7_plot

##################################################################################################
# Build the MYBPC3 protein schematic layer by layer
MYBPC3_plot <- draw_canvas(MYBPC3_data)
MYBPC3_plot <- draw_chains(MYBPC3_plot, MYBPC3_data)
MYBPC3_plot <- draw_domains(MYBPC3_plot, MYBPC3_data)
MYBPC3_plot <- draw_regions(MYBPC3_plot, MYBPC3_data)
#MYBPC3_plot <- draw_phospho(MYBPC3_plot, MYBPC3_data)

#######Note, 7 variants are not shown because they are splice variants

# Residue positions of the cohort's variants to overlay on the schematic
variants <- data.frame(
  begin = c(1098, 1096, 1064, 791, 542, 502, 495, 258, 219),
  y = rep(1, 9)  
)

# Now, add these points to your plot
MYBPC3_plot <- MYBPC3_plot + 
  geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)

# Adjust themes and aesthetics as needed
MYBPC3_plot <- MYBPC3_plot + theme_bw(base_size = 14) + 
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(), 
        axis.text.y = element_blank(),
        panel.border = element_blank())

MYBPC3_plot

##################################################################################################
# Build the SCN5A protein schematic layer by layer
SCN5A_plot <- draw_canvas(SCN5A_data)
SCN5A_plot <- draw_chains(SCN5A_plot, SCN5A_data)
SCN5A_plot <- draw_domains(SCN5A_plot, SCN5A_data)
SCN5A_plot <- draw_regions(SCN5A_plot, SCN5A_data)
#SCN5A_plot <- draw_phospho(SCN5A_plot, SCN5A_data)

#######Note, 2 variants are not shown because they are splice variants
# Residue positions of the cohort's variants to overlay on the schematic
variants <- data.frame(
  begin = c(1784,1595,1319,1316,845,828,814),
  y = rep(1, 7)  
)


# Now, add these points to your plot
SCN5A_plot <- SCN5A_plot + 
  geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)

# Adjust themes as needed
SCN5A_plot <- SCN5A_plot + theme_bw(base_size = 14) + 
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(), 
        axis.text.y = element_blank(),
        panel.border = element_blank())


SCN5A_plot

Noncoding******************************************************************************************************************

Pull in the file that is just the ATAC data intervals overlaid on top of the Hi-C intervals which map to the genes of interest

# Read the PC-Hi-C ATAC overlay BED file (chrom, start, end)
bed_data <- read_tsv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/SFRN_cohort_data/noncoding/noncoding_rare/EP_ATAC_intersect_final_output_nodups.bed", col_names = FALSE, col_types = cols(
  X1 = col_character(), # Chromosome
  X2 = col_double(),    # Start Position
  X3 = col_double()     # End Position
))
# Calculate Interval Sizes (width in bp of each interval)
bed_data$interval_size <- bed_data$X3 - bed_data$X2

# Calculate the median of interval sizes
median_interval_size <- median(bed_data$interval_size, na.rm = TRUE)

# Histogram of interval sizes; dashed line marks the median.
# `size` for line geoms is deprecated since ggplot2 3.4.0 — use `linewidth`
# (this session runs ggplot2 >= 3.5 per its own deprecation warnings).
interval_size_histogram <- ggplot(bed_data, aes(x = interval_size)) +
  geom_histogram(binwidth = 100, fill = "lightblue", color = "black") +
  geom_vline(aes(xintercept = median_interval_size), color = "black", linetype = "dashed", linewidth = 1) +
  xlab("Interval Size") +
  ylab("Frequency") +
  ggtitle("Histogram of Interval Sizes") +
  theme_cowplot(12)

interval_size_histogram

# Mean and median ATAC interval width in bp (console echoes retained)
mean(bed_data$interval_size)
[1] 497.4085
median(bed_data$interval_size)
[1] 396

Now pull in the ATAC data that maps to genes of interest, including the promoter fragment the ATAC data mapped to

# Read the BED file: ATAC peak coordinates plus the promoter fragment and
# gene annotation each peak maps to
bed_data <- read_tsv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/EP_ATAC_intersect_final_output_annotated_unique.bed", col_names = c("Chromosome", "Start", "End", "Chromosome2", "GeneStart", "GeneEnd", "GeneNames"), col_types = cols(
  Chromosome = col_character(),
  Start = col_double(),
  End = col_double(),
  Chromosome2 = col_character(),
  GeneStart = col_double(),
  GeneEnd = col_double(),
  GeneNames = col_character()
))

# Split the gene names in case there are multiple genes separated by semicolon
# (unnest gives one row per peak-gene pair)
bed_data <- bed_data %>% 
  mutate(GeneNames = strsplit(GeneNames, ";")) %>%
  unnest(GeneNames)

# Read the gene list (one gene symbol per line)
gene_list <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/Lorenzo_panel_genes_simple.txt")

# Filter bed_data to keep only rows with genes in the gene list
bed_data <- bed_data %>%
  filter(GeneNames %in% gene_list)

# Count the number of regions mapped to each gene
gene_count <- bed_data %>%
  count(GeneNames)

print(gene_count)
# A tibble: 347 × 2
   GeneNames     n
   <chr>     <int>
 1 A2ML1        15
 2 ABAT         45
 3 ABCC9        35
 4 ACADVL        3
 5 ACTC1         9
 6 ACTL6B       13
 7 ACTN2        21
 8 ADAM22        3
 9 ADSL         35
10 AGL          59
# ℹ 337 more rows
# Calculate the median number of regions per gene
median_regions <- median(gene_count$n, na.rm = TRUE)

# Histogram of regions per gene; dashed line marks the median.
# `size` for line geoms is deprecated since ggplot2 3.4.0 — use `linewidth`.
regions_per_gene <- ggplot(gene_count, aes(x = n)) +
  geom_histogram(binwidth = 1, fill = "#B40F20", color = "#B40F20") +
  geom_vline(aes(xintercept = median_regions), color = "black", linetype = "dashed", linewidth = 1) +
  xlab("Number of Regions") +
  ylab("Frequency") +
  theme_cowplot(12) +
  ggtitle("Histogram of the Distribution of Number of Regions per Gene")

regions_per_gene

# mean() on a whole data frame is an error (returns NA with a warning in
# modern R); the intended quantity is the mean of the count column
mean(gene_count$n)

mean(gene_count$n)
[1] 31.40922
median(gene_count$n)
[1] 24
# Width of each mapped region in bp
bed_data$region_size <- bed_data$End - bed_data$Start

# Sum region sizes for each gene (total accessible sequence per gene)
total_region_size_per_gene <- bed_data %>%
  group_by(GeneNames) %>%
  summarise(TotalRegionSize = sum(region_size))

mean(total_region_size_per_gene$TotalRegionSize)
[1] 16239.12
median(total_region_size_per_gene$TotalRegionSize)
[1] 12010
# Create histogram of Total Region Sizes
ggplot(total_region_size_per_gene, aes(x = TotalRegionSize)) +
  geom_histogram(binwidth = 1000, fill = "blue", color = "black") +
  xlab("Total Region Size") +
  ylab("Frequency") +
  ggtitle("Histogram of Total Region Sizes per Gene")

# NOTE(review): this overwrites the gene_list read from file above with the
# genes actually present in bed_data (used below for the biomaRt query)
gene_list <- unique(bed_data$GeneNames)

# Select hg19 (GRCh37) from Ensembl. Use an explicit https host — biomaRt
# warns (see transcript) that Ensembl will soon enforce https-only access.
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl", host = "https://grch37.ensembl.org")
Warning: Ensembl will soon enforce the use of https.
Ensure the 'host' argument includes "https://"
# Get TSS positions along with strand information for the panel genes
# (one row per transcript, so genes appear multiple times)
tss_info <- getBM(attributes = c('hgnc_symbol', 'chromosome_name', 'transcription_start_site', 'strand'),
                  filters = 'hgnc_symbol',
                  values = gene_list,
                  mart = ensembl)
# Select the most upstream TSS among one gene's transcripts.
# Forward-strand genes (all strand == 1): upstream = smallest coordinate.
# Anything else falls through to the reverse-strand rule (largest coordinate).
select_upstream_tss <- function(df) {
  upstream <- if (all(df$strand == 1)) {
    min(df$transcription_start_site)
  } else {
    max(df$transcription_start_site)
  }
  data.frame(transcription_start_site = upstream)
}

# Apply the function to each group of genes (one upstream TSS per symbol).
# NOTE(review): dplyr::do() is superseded; group_modify() is the modern form.
tss_upstream <- tss_info %>%
  group_by(hgnc_symbol) %>%
  do(select_upstream_tss(.))


# Merge bed_data with tss_upstream on gene names
merged_data <- merge(bed_data, tss_upstream, by.x = "GeneNames", by.y = "hgnc_symbol")

# Calculate the midpoint (peak center) of each interval
merged_data$Center <- (merged_data$Start + merged_data$End) / 2

# Calculate the distance
merged_data$DistanceToTSS <- abs(merged_data$Center - merged_data$transcription_start_site)

Distance to TSS

# Mean/median distance (bp) from peak centre to the contacted gene's TSS
mean(merged_data$DistanceToTSS)
[1] 264213.1
median(merged_data$DistanceToTSS)
[1] 176294
# Create a histogram of peak-centre -> TSS distances on a log10 x axis;
# dashed line marks the median.
median_distance <- median(merged_data$DistanceToTSS, na.rm = TRUE)

# `size` for line geoms is deprecated since ggplot2 3.4.0 — use `linewidth`.
# binwidth = 0.1 is in log10 units because of scale_x_log10().
dist_tss <- ggplot(merged_data, aes(x = DistanceToTSS)) +
  geom_histogram(binwidth = 0.1, fill = "#FFB3BA", color = "black") +
  geom_vline(aes(xintercept = median_distance), color = "black", linetype = "dashed", linewidth = 1) +
  xlab("Distance to TSS (bp)") +
  ylab("Frequency") +
  theme_cowplot(12) +
  ggtitle("Histogram of Distances to Transcription Start Sites (TSS)") +
  scale_x_log10(breaks = 10^(0:6) * 10000, labels = scales::comma)

dist_tss

# Retrieve TSS information for all genes along with HGNC symbols
# (genome-wide, not just panel genes — used for the nearest-TSS check below)
all_tss_info <- getBM(
  attributes = c('hgnc_symbol', 'chromosome_name', 'transcription_start_site', 'strand'),
  mart = ensembl
)

# Filter out rows where hgnc_symbol is blank
filtered_all_tss_info <- all_tss_info %>%
  filter(hgnc_symbol != "")

See if the peak is closer to another TSS than the contacted one!

# Select the most upstream TSS and its chromosome for one gene, as before,
# but genome-wide. Empty input yields an all-NA row so do() keeps the group.
select_upstream_tss <- function(df) {
  if (nrow(df) == 0) {
    return(data.frame(chromosome_name = NA, transcription_start_site = NA))
  }
  # Forward strand (all strand == 1) -> smallest coordinate; otherwise largest
  tss <- if (all(df$strand == 1)) {
    min(df$transcription_start_site, na.rm = TRUE)
  } else {
    max(df$transcription_start_site, na.rm = TRUE)
  }
  # Chromosome of the transcript whose TSS matches the selected coordinate
  idx <- which.min(abs(df$transcription_start_site - tss))
  data.frame(chromosome_name = df$chromosome_name[idx], transcription_start_site = tss)
}

# Apply the function to each group of genes (one upstream TSS per symbol)
all_tss_upstream <- filtered_all_tss_info %>%
  group_by(hgnc_symbol) %>%
  do(select_upstream_tss(.)) %>%
  ungroup() 

# Prepend 'chr' to chromosome names to match bed_data's UCSC-style seqnames
all_tss_upstream$chromosome_name <- paste0("chr", all_tss_upstream$chromosome_name)
# Create GRanges object for all_tss_upstream (width-1 ranges at each TSS)
tss_ranges <- GRanges(
  seqnames = all_tss_upstream$chromosome_name,
  ranges = IRanges(start = all_tss_upstream$transcription_start_site, end = all_tss_upstream$transcription_start_site)
)

# Create GRanges object for merged_data (width-1 ranges at each peak centre).
# NOTE(review): Center can be fractional (Start+End odd); IRanges coerces to
# integer — confirm this rounding is acceptable.
merged_data_ranges <- GRanges(
  seqnames = merged_data$Chromosome,
  ranges = IRanges(start = merged_data$Center, end = merged_data$Center)
)

# Find the nearest TSS for each midpoint (peak center)
nearest_hits <- nearest(merged_data_ranges, tss_ranges)

# Extract the closest TSS information (row-index lookup; NA hits give NA rows)
closest_tss_info <- all_tss_upstream[nearest_hits, ]

# Add closest TSS information to merged_data
merged_data$closest_gene <- closest_tss_info$hgnc_symbol
merged_data$closest_tss <- closest_tss_info$transcription_start_site
merged_data$closest_tss_chromosome <- closest_tss_info$chromosome_name

# Calculate distance to closest TSS
merged_data$distance_to_closest_tss <- abs(merged_data$Center - merged_data$closest_tss)
# Compare GeneNames with closest_gene and append (TRUE when the contacted
# gene is also the nearest gene)
merged_data$gene_match <- merged_data$GeneNames == merged_data$closest_gene
# Calculate the fraction of matches
fraction_match <- sum(merged_data$gene_match, na.rm = TRUE) / nrow(merged_data)

fraction_mismatch <- 1-(sum(merged_data$gene_match, na.rm = TRUE) / nrow(merged_data))

# Print the fraction as percentages (console echoes retained)
print(100*fraction_match)
[1] 13.26727
print(100*fraction_mismatch)
[1] 86.73273

Let's see which genes have regions mapped to them that are closer to other TSSs

# Aggregate data by GeneNames: per-gene flags for whether any/all of its
# peaks are nearest to its own TSS (gene_match)
gene_summary <- merged_data %>%
  group_by(GeneNames) %>%
  summarize(
    any_true = any(gene_match, na.rm = TRUE),
    any_false = any(!gene_match, na.rm = TRUE),
    all_true = all(gene_match, na.rm = TRUE),
    all_false = all(!gene_match, na.rm = TRUE)
  ) %>%
  ungroup()

# Adjusted calculations to include the 16 panel genes with no ATAC peaks
# (they are absent from gene_summary but belong in the denominator)
adjusted_denominator <- nrow(gene_summary) + 16

percent_any_true <- sum(gene_summary$any_true) / adjusted_denominator * 100
number_any_true <- sum(gene_summary$any_true)
percent_any_false <- sum(gene_summary$any_false) / adjusted_denominator * 100
number_any_false <- sum(gene_summary$any_false) 
percent_all_true <- sum(gene_summary$all_true) / adjusted_denominator * 100
number_all_true <- sum(gene_summary$all_true) 
percent_all_false <- sum(gene_summary$all_false) / adjusted_denominator * 100
number_all_false <- sum(gene_summary$all_false) 
percent_both_true_false <- sum(gene_summary$any_true & gene_summary$any_false) / adjusted_denominator * 100
number_both_true_false <- sum(gene_summary$any_true & gene_summary$any_false) 

# Print the results.
# NOTE(review): the panel total (363) and the no-peak gene count (16) are
# hard-coded here — keep them in sync with adjusted_denominator above.
cat("Total genes in panel", 363 ,"\n")
Total genes in panel 363 
cat("Genes with no ATAC peaks:", 100*(16/363),"%, ",16,"\n")
Genes with no ATAC peaks: 4.407713 %,  16 
cat("Genes with at least one TRUE:", percent_any_true,"%, ", number_any_true,"\n")
Genes with at least one TRUE: 42.69972 %,  155 
cat("Genes with at least one FALSE:", percent_any_false,"%, ", number_any_false,"\n")
Genes with at least one FALSE: 93.66391 %,  340 
cat("Genes with only TRUE:", percent_all_true,"%, ", number_all_true,"\n")
Genes with only TRUE: 1.928375 %,  7 
cat("Genes with only FALSE:", percent_all_false,"%, ", number_all_false,"\n")
Genes with only FALSE: 52.89256 %,  192 
cat("Genes with both TRUE and FALSE:", percent_both_true_false,"%, ", number_both_true_false,"\n")
Genes with both TRUE and FALSE: 40.77135 %,  148 

Input the count data

# Per-individual noncoding variant counts (one row per person)
noncoding_stats <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/noncoding_frequency_by_individual_for_manuscript.txt", header = TRUE, sep = "\t")

Categorize the data

# Collapse arrest categories into Control (1) vs Case (2-6); any other
# category value becomes NA via case_when's default
categorized_noncoding_stats <- noncoding_stats %>%
  mutate(Category_Group = case_when(
    Category == 1 ~ "Control",
    #Category %in% 2:3 ~ "2-3",
    Category %in% 2:6 ~ "Case"
  ))

Aggregate and clean

#clean it up to remove inadvertent NAs or infs
# NOTE(review): filter_all()/all_vars() are superseded — if_all() is the
# modern equivalent.
categorized_noncoding_stats <- categorized_noncoding_stats %>%
  dplyr::select(-ID, -Category) %>% 
  na.omit() %>% 
  dplyr::filter_all(all_vars(!is.infinite(.))) 

# Aggregate the Data to compute mean and standard deviation for each numeric column
collapsed_noncoding_stats <- categorized_noncoding_stats %>%
  group_by(Category_Group) %>%
  summarise(across(where(is.numeric), list(mean = ~mean(., na.rm = TRUE), 
                                           std = ~sd(., na.rm = TRUE))))

# Remove rows with NA values
collapsed_noncoding_stats <- na.omit(collapsed_noncoding_stats)

Perform ANOVA with Tukey

# List of variables to analyze
noncoding_variables_to_analyze <- c("variant_count", "Epilepsy_gnomAD.0.001","Epilepsy_ultrarare","Cardiomyopathy_gnomAD.0.001","Cardiomyopathy_ultrarare")

ttest_results <- list()

# Per-variable Control-vs-Case comparison. Pooled-variance t-test when
# Levene's test (car::leveneTest) does not reject homogeneity of variances;
# Welch's t-test otherwise (and when residuals are constant or the Levene
# p-value is NA — the original code duplicated the t.test call across three
# branches and would have crashed on an NA p-value).
for (var in noncoding_variables_to_analyze) {
  if (is.numeric(categorized_noncoding_stats[[var]])) {
    categorized_noncoding_stats$Category_Group <- as.factor(categorized_noncoding_stats$Category_Group)
    fml <- reformulate("Category_Group", response = var)

    # Fit the ANOVA only to obtain residuals for the constant-residual check
    # (named model_resid to avoid shadowing stats::residuals)
    model_resid <- residuals(aov(fml, data = categorized_noncoding_stats))

    # Default to Welch (var.equal = FALSE); upgrade to the pooled test only
    # when residuals vary AND Levene's test clearly fails to reject (p > .05)
    use_pooled <- FALSE
    if (length(unique(model_resid)) > 1) {
      levene_p <- leveneTest(fml, data = categorized_noncoding_stats)$`Pr(>F)`[1]
      # isTRUE() guards against an NA p-value crashing the comparison
      use_pooled <- isTRUE(levene_p > 0.05)
    }

    ttest <- t.test(fml, data = categorized_noncoding_stats, var.equal = use_pooled)
    ttest_results[[var]] <- broom::tidy(ttest)
  } else {
    # Non-numeric column: record NA so the results list stays aligned
    ttest_results[[var]] <- NA
  }
}

# Print t-test results (one tidy row per variable)
print("t-test Results:")
[1] "t-test Results:"
print(bind_rows(ttest_results, .id = "Variable"))
# A tibble: 5 × 11
  Variable    estimate estimate1 estimate2 statistic  p.value parameter conf.low
  <chr>          <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>    <dbl>
1 variant_co…  -108.      3759.    3867.       -5.54 3.82e- 8     1041.  -146.  
2 Epilepsy_g…    -1.52      46.2     47.7      -1.96 5.02e- 2     1041     -3.04
3 Epilepsy_u…     8.62      20.5     11.9       5.15 3.75e- 7      520.     5.33
4 Cardiomyop…    -4.73      38.3     43.0      -6.79 1.90e-11     1041     -6.10
5 Cardiomyop…     7.00      16.6      9.64      5.80 1.11e- 8      529.     4.63
# ℹ 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>

Create the Function to plot each column as a ratio to the means from Group 1

# Summarise one column as a ratio to the Control-group mean.
#
# data:     data frame with a Category_Group column and the target column
# col_name: name (string) of the numeric column to summarise
# Returns list(summary_data = <per-group tibble>) with mean, SD, N, the
# ratio-to-Control, its SEM, and 95% CI bounds on that ratio.
plot_column_ratio <- function(data, col_name) {
  # Control-group mean, the denominator for every ratio below.
  # [[ ]]-extraction is tibble-safe: data[rows, col_name] returns a one-column
  # tibble (not a vector) when `data` is a tibble, which breaks mean().
  # (The original also computed the Control SD here but never used it.)
  group1_mean <- mean(data[[col_name]][data$Category_Group == "Control"], na.rm = TRUE)

  # Per-group mean/SD/N, then ratio and SEM-based 95% CI on the ratio scale
  data_summary <- data %>%
    group_by(Category_Group) %>%
    summarise(mean_value = mean(.data[[col_name]], na.rm = TRUE),
              sd_value = sd(.data[[col_name]], na.rm = TRUE),
              n = n()) %>%
    mutate(ratio = mean_value / group1_mean,
           sem_value = sd_value / sqrt(n),        # SEM of the group mean
           sem_ratio = sem_value / group1_mean,   # SEM on the ratio scale
           ci_lower = ratio - (1.96 * sem_ratio), # lower 95% CI bound
           ci_upper = ratio + (1.96 * sem_ratio)) # upper 95% CI bound

  return(list(summary_data = data_summary))
}
# Columns to summarise: everything except identifier/grouping columns
columns_to_plot <- setdiff(names(categorized_noncoding_stats), c("ID", "Category", "Category_Group"))

# Build the per-column summaries as a list and bind once at the end.
# The original grew the frame with bind_rows() inside a loop (quadratic
# copying) and seeded it with empty `mean`/`std` columns that surfaced as
# spurious all-NA columns in the result.
combined_noncoding_stats_summary_df <- bind_rows(
  lapply(columns_to_plot, function(col) {
    # Tag each per-column summary with the column it describes
    mutate(plot_column_ratio(categorized_noncoding_stats, col)$summary_data, col_name = col)
  })
)
# Subset the data for the specified variables (cases only — Control is the
# reference line at ratio = 1)
selected_noncoding_data <- combined_noncoding_stats_summary_df %>%
  filter(col_name %in% c(
                       "variant_count",
                       "Cardiomyopathy_gnomAD.0.001",
                        "Epilepsy_gnomAD.0.001",
                        "Cardiomyopathy_ultrarare",
                       "Epilepsy_ultrarare"),
         Category_Group != "Control") 


# Define the specific order for noncoding variants (top-to-bottom plot order)
levels_order <- c(
                      "variant_count",
                      "Epilepsy_ultrarare",
                      "Epilepsy_gnomAD.0.001",
                      "Cardiomyopathy_ultrarare",
                      "Cardiomyopathy_gnomAD.0.001"
                       )

# Factorize the 'col_name' column with the specified order
selected_noncoding_data$col_name <- factor(selected_noncoding_data$col_name, levels = levels_order)

# Plot: case/control ratio per variant class with +/- SEM error bars;
# dashed line at 1 marks parity with controls.
# NOTE(review): points are dodged with width = 1 but error bars with
# width = 0.5 — with a single non-Control group this is invisible, but the
# widths should match if more groups are added.
noncoding_stats_plot <- ggplot(selected_noncoding_data, aes(y = col_name, x = ratio, color = Category_Group)) +
  geom_point(position = position_dodge(width = 1), size = 3) +
  geom_errorbar(aes(xmin = ratio - sem_ratio, xmax = ratio + sem_ratio), position = position_dodge(width = 0.5), width = 0.5) +
  geom_vline(xintercept = 1, linetype = "dashed") +
  scale_color_manual(values = "#FF6464") +
  labs(title = "Ratio Compared to Group 1 +/-SEM",
       y = "Variant",
       x = "Ratio to Group 1 Mean",
       color = "Category Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 8), 
        axis.text.y = element_text(size = 8, hjust = 1),
        axis.title.x = element_text(size = 8), 
        axis.title.y = element_text(size = 8),
        legend.title = element_text(size = 8),
        legend.text = element_text(size = 8),
        plot.title = element_text(size = 8, hjust = 0.5),
        plot.subtitle = element_text(size = 8),
        plot.caption = element_text(size = 8),
        plot.margin = margin(15, 15, 15, 15)) +
  scale_x_continuous(limits = c(0.75, 2))


print(noncoding_stats_plot)

Input the data (change to your path)

# Pull in the gene data (per-individual noncoding variant counts by gene)
gene_data_noncoding <- read.csv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/individual_noncoding_variants_by_gene_for_manuscript.csv")

# Read the cohort data
cohort <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SFRN_cohort_for_manuscript.txt", sep = "\t", header = TRUE)

# add the arrest category to each row by matching on individual ID
gene_data_noncoding$Category <- cohort$arrest_ontology[match(gene_data_noncoding$ID, cohort$CGM_id)]

#filter for discovery only
gene_data_noncoding_discovery <- gene_data_noncoding %>%
  filter(Set == "Discovery")

Summarize the data by GENE, counting the number of variants for each gene. Filter for Category > 1, then group and summarize.

# Per-gene total of ultrarare noncoding variants among cases (Category > 1)
variants_per_gene_Cat_2_6 <- gene_data_noncoding_discovery %>%
  filter(Category > 1) %>%
  group_by(GENE) %>%
  summarise(
    noncoding_ultrarare = sum(noncoding_ultrarare, na.rm = TRUE),
    .groups = 'drop'
  )

# Print the result
print(variants_per_gene_Cat_2_6)
# A tibble: 424 × 2
   GENE       noncoding_ultrarare
   <chr>                    <int>
 1 A2ML1                       99
 2 AAMP                        10
 3 ABAT                        68
 4 ABCC9                        0
 5 AC011551.3                   0
 6 ACADVL                      28
 7 ACTC1                       31
 8 ACTL6B                     183
 9 ACTN2                       28
10 ADAM22                       0
# ℹ 414 more rows

Filter for Category == 1 and then group and summarize

# Per-gene total of ultrarare noncoding variants among controls (Category 1)
variants_per_gene_Cat_1 <- gene_data_noncoding_discovery %>%
  filter(Category == 1) %>%
  group_by(GENE) %>%
  summarise(
    noncoding_ultrarare = sum(noncoding_ultrarare, na.rm = TRUE),
    .groups = 'drop'
  )

# Print the result
print(variants_per_gene_Cat_1)
# A tibble: 424 × 2
   GENE       noncoding_ultrarare
   <chr>                    <int>
 1 A2ML1                       37
 2 AAMP                         1
 3 ABAT                        50
 4 ABCC9                        0
 5 AC011551.3                   0
 6 ACADVL                       9
 7 ACTC1                        7
 8 ACTL6B                     130
 9 ACTN2                       16
10 ADAM22                       0
# ℹ 414 more rows

Append panel source to the gene_data

# Ensure that the 'GENE' column in 'gene_data_noncoding_discovery' and 'Gene' in 'all_genes' are character type
gene_data_noncoding_discovery$GENE <- as.character(gene_data_noncoding_discovery$GENE)
all_genes$Gene <- as.character(all_genes$Gene)

# Find the index of each gene in 'all_genes'
# Then, use this index to assign the corresponding 'Source' to a new column in 'gene_data'
# (genes absent from the panel lookup get NA)
gene_data_noncoding_discovery$Source <- all_genes$Source[match(gene_data_noncoding_discovery$GENE, all_genes$Gene)]

# Now, 'gene_data' will have a new column 'Source' with the source of each gene
# based on the lookup from 'all_genes'

# View the first few rows to verify the new 'Source' has been added
head(gene_data_noncoding_discovery)
          ID         GENE noncoding_ultrarare       Set Category         Source
1 CGM0000969       RANGRF                   0 Discovery        2 Cardiomyopathy
2 CGM0000969          AGL                   0 Discovery        2 Cardiomyopathy
3 CGM0000969        RNF13                   0 Discovery        2       Epilepsy
4 CGM0000969       PRKAG2                   0 Discovery        2 Cardiomyopathy
5 CGM0000969         JPH2                   1 Discovery        2 Cardiomyopathy
6 CGM0000969 RP11-156P1.2                   0 Discovery        2           <NA>

Add the source to the category 1 variants list

# Attach each gene's panel source (Cardiomyopathy/Epilepsy; NA if not listed)
variants_per_gene_Cat_1$Source <- all_genes$Source[match(variants_per_gene_Cat_1$GENE, all_genes$Gene)]

head(variants_per_gene_Cat_1)
# A tibble: 6 × 3
  GENE       noncoding_ultrarare Source        
  <chr>                    <int> <chr>         
1 A2ML1                       37 Cardiomyopathy
2 AAMP                         1 <NA>          
3 ABAT                        50 Epilepsy      
4 ABCC9                        0 Cardiomyopathy
5 AC011551.3                   0 <NA>          
6 ACADVL                       9 Cardiomyopathy

Add the source to the category 2-6 variants list

# Attach each gene's panel source (Cardiomyopathy/Epilepsy; NA if not listed)
variants_per_gene_Cat_2_6$Source <- all_genes$Source[match(variants_per_gene_Cat_2_6$GENE, all_genes$Gene)]

head(variants_per_gene_Cat_2_6)
# A tibble: 6 × 3
  GENE       noncoding_ultrarare Source        
  <chr>                    <int> <chr>         
1 A2ML1                       99 Cardiomyopathy
2 AAMP                        10 <NA>          
3 ABAT                        68 Epilepsy      
4 ABCC9                        0 Cardiomyopathy
5 AC011551.3                   0 <NA>          
6 ACADVL                      28 Cardiomyopathy

combine the variants together now

# Add a new column to indicate the category (controls)
variants_per_gene_Cat_1 <- variants_per_gene_Cat_1 %>%
  mutate(Category = "1")  

# Add a new column to indicate the category range (cases)
variants_per_gene_Cat_2_6 <- variants_per_gene_Cat_2_6 %>%
  mutate(Category = "2-6")  

# Combine the two datasets (stacked long format: one row per gene x category)
combined_variants <- bind_rows(variants_per_gene_Cat_1, variants_per_gene_Cat_2_6)

# Print the combined dataset
print(combined_variants)
# A tibble: 848 × 4
   GENE       noncoding_ultrarare Source         Category
   <chr>                    <int> <chr>          <chr>   
 1 A2ML1                       37 Cardiomyopathy 1       
 2 AAMP                         1 <NA>           1       
 3 ABAT                        50 Epilepsy       1       
 4 ABCC9                        0 Cardiomyopathy 1       
 5 AC011551.3                   0 <NA>           1       
 6 ACADVL                       9 Cardiomyopathy 1       
 7 ACTC1                        7 Cardiomyopathy 1       
 8 ACTL6B                     130 Epilepsy       1       
 9 ACTN2                       16 Cardiomyopathy 1       
10 ADAM22                       0 Epilepsy       1       
# ℹ 838 more rows
# Filter dataset where noncoding_ultrarare > 0 and drop genes without a
# panel source (they cannot be faceted)
noncoding_ultrarare_data <- combined_variants %>%
  filter(noncoding_ultrarare > 0) %>%
  filter(!is.na(Source))


# Axis-label helper for the noncoding heatmap: Category "1" -> "Control",
# any other Category value -> "Case"; values of other variables pass through.
custom_labeller_noncoding <- function(variable, value) {
  if (variable != "Category") {
    return(value)
  }
  ifelse(value == "1", "Control", "Case")
}

# Create the noncoding_ultrarare plot: gene x category heatmap of ultrarare
# noncoding variant counts, faceted by panel source
noncoding_ultrarare_plot <- ggplot(noncoding_ultrarare_data, aes(x = GENE, y = Category, fill = noncoding_ultrarare)) +
  geom_tile() + 
  scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"), 
                       values = scales::rescale(c(0, 0.5, 1))) + 
  facet_wrap(~ Source, scales = "free_x", ncol = 1) + 
  labs(title = "Noncoding Ultrarare Heatmap",
       x = "Gene",
       y = "Category",
       fill = "Count") +
  theme_cowplot(12) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        strip.background = element_blank(), 
        strip.placement = "outside") +
  scale_y_discrete(labels = function(x) custom_labeller_noncoding("Category", x))

# Print the plot
print(noncoding_ultrarare_plot)

Combined******************************************************************************************************************

# Read the cohort data (combined coding + noncoding per-individual counts)
total_data <- read.table('/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/combined_data_for_manuscript.txt', header = TRUE, sep = '\t', stringsAsFactors = FALSE)


# Split into the discovery and replication sets
combined_data <- total_data %>% filter(Set =="Discovery") 

replication_data <- total_data %>%  filter(Set =="Replication") 

Categorize the Data based on original categories and arrest status

# Collapse the six arrest-ontology groups into a binary outcome:
# group 1 is the control group, groups 2-6 are cases. Rows with any
# other arrest_group value fall out via the NA filter.
categorized_combined_data <- combined_data %>%
  mutate(
    Category = case_when(
      arrest_group == 1 ~ "Control",
      arrest_group %in% 2:6 ~ "zCase"
    )
  ) %>%
  filter(!is.na(Category)) %>%
  mutate(Category = as.factor(Category))

############################ ############## ############## replication
# Same binary collapse for the replication cohort. The "z" prefix on
# the case label keeps the alphabetical factor-level ordering
# (Control before zCase) that downstream plots rely on.
categorized_replication_data <- replication_data %>%
  mutate(
    Category = case_when(
      arrest_group == 1 ~ "Control",
      arrest_group %in% 2:6 ~ "zCase"
    )
  ) %>%
  filter(!is.na(Category)) %>%
  mutate(Category = as.factor(Category))

Collapse the rare vars

# Collapse the per-frequency-bin variant counts into summary burden
# columns for the discovery cohort: rare (MAF 0.01 + 0.001),
# ultrarare (MAF 1e-5 + singletons), null (start-lost + stop-gained),
# and noncoding-rare, for each gene set.
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    Epilepsy_rare = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
    Epilepsy_ultrarare = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
    Epilepsy_null = Epilepsy_start_lost + Epilepsy_stop_gained,
    Cardiomyopathy_rare = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
    Cardiomyopathy_ultrarare = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
    Cardiomyopathy_null = Cardiomyopathy_start_lost + Cardiomyopathy_stop_gained,
    Epilepsy_noncoding_rare = Epilepsy_noncoding_gnomad_0.001 + Epilepsy_noncoding_cohort_0.01 + Epilepsy_noncoding_cohort_singleton,
    Cardiomyopathy_noncoding_rare = Cardiomyopathy_noncoding_gnomad_0.001 + Cardiomyopathy_noncoding_cohort_0.01 + Cardiomyopathy_noncoding_cohort_singleton
  )


############################ ############## ############## replication
# Identical burden-collapse for the replication cohort so both cohorts
# carry the same derived columns.
categorized_replication_data <- categorized_replication_data %>%
  mutate(
    Epilepsy_rare = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
    Epilepsy_ultrarare = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
    Epilepsy_null = Epilepsy_start_lost + Epilepsy_stop_gained,
    Cardiomyopathy_rare = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
    Cardiomyopathy_ultrarare = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
    Cardiomyopathy_null = Cardiomyopathy_start_lost + Cardiomyopathy_stop_gained,
    Epilepsy_noncoding_rare = Epilepsy_noncoding_gnomad_0.001 + Epilepsy_noncoding_cohort_0.01 + Epilepsy_noncoding_cohort_singleton,
    Cardiomyopathy_noncoding_rare = Cardiomyopathy_noncoding_gnomad_0.001 + Cardiomyopathy_noncoding_cohort_0.01 + Cardiomyopathy_noncoding_cohort_singleton
  )

Perform multivariate Logistic Regression on everything

# Full multivariate logistic model: binary Category regressed on the
# normalized common-variant traits plus rare/ultrarare coding, P/LP,
# noncoding-rare, and null variant burdens in the discovery cohort.
# glm() models the SECOND factor level ("zCase") as the event, so
# positive coefficients mean higher case odds.
model <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare  + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare  + Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null, 
             data = categorized_combined_data, family = binomial())

# Print the fitted call and coefficients (output follows below).
model

Call:  glm(formula = Category ~ Normalized_Heart_rate + Normalized_PR_interval + 
    Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + 
    Normalized_LVESV + Normalized_SVI + Normalized_Afib + Epilepsy_rare + 
    Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + 
    PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + 
    Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null, 
    family = binomial(), data = categorized_combined_data)

Coefficients:
                  (Intercept)          Normalized_Heart_rate  
                    -0.685379                      -0.081183  
       Normalized_PR_interval                 Normalized_QTc  
                     0.080199                      -0.057222  
                Normalized_BP                 Normalized_QRS  
                     0.280711                       0.405413  
              Normalized_LVEF               Normalized_LVESV  
                    -0.183223                       0.100668  
               Normalized_SVI                Normalized_Afib  
                     0.016921                      -0.110002  
                Epilepsy_rare             Epilepsy_ultrarare  
                    -0.041446                      -0.011901  
          Cardiomyopathy_rare       Cardiomyopathy_ultrarare  
                    -0.025813                      -0.006447  
                 PLP_Epilepsy             PLP_Cardiomyopathy  
                    -0.223524                       1.218792  
Cardiomyopathy_noncoding_rare        Epilepsy_noncoding_rare  
                    -0.010962                       0.026840  
          Cardiomyopathy_null                  Epilepsy_null  
                     0.921670                       0.425896  

Degrees of Freedom: 1042 Total (i.e. Null);  1023 Residual
Null Deviance:      1445 
Residual Deviance: 1235     AIC: 1275

Compute the scores

# Linear-predictor (log-odds) scores from the full model.
# Discovery cohort: fitted link values for the training data.
scores <- predict(model, type = "link")
categorized_combined_data$scores <- scores

# Replication cohort: out-of-sample link values from the same model.
scores_replication <- predict(model, newdata = categorized_replication_data, type = "link")
categorized_replication_data$scores_replication <- scores_replication

Perform multivariate logistic regressions based only on subsets of the input parameters

# Sub-model: cardiomyopathy-gene coding burden only (P/LP, rare, ultrarare, null).
model_cardiomyopathy_rare <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null, data = categorized_combined_data, family = binomial())

# Sub-model: epilepsy-gene coding burden only.
model_epilepsy_rare <- glm(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null, data = categorized_combined_data, family = binomial())

# Sub-model: noncoding rare burden only (both gene sets).
model_noncoding_rare <- glm(Category ~ Epilepsy_noncoding_rare + Cardiomyopathy_noncoding_rare, data = categorized_combined_data, family = binomial())

# Sub-model: normalized common-variant (PRS/trait) predictors only.
model_common <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib, 
                    data = categorized_combined_data, family = binomial())

# Sub-model: all coding rare burden (both gene sets), no common variants.
model_coding_rare <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_null + Epilepsy_null, data = categorized_combined_data, family = binomial())

# Sub-model: epilepsy coding burden plus common-variant predictors.
model_epilepsy_and_common <- glm(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib, data = categorized_combined_data, family = binomial())

# Sub-model: cardiomyopathy coding burden plus common-variant predictors.
model_cardiac_and_common <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib, data = categorized_combined_data, family = binomial())

# Sub-model: common variants plus all coding burden (the full model minus
# the noncoding terms).
model_common_and_coding <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare  + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_null + Epilepsy_null, 
             data = categorized_combined_data, family = binomial())

Compute the rest of the scores and determine rank-based quintiles for all scores

# Score each discovery subject under every sub-model on the link
# (log-odds) scale; one new column per sub-model, in this order.
discovery_models <- list(
  cardiac_variant_score = model_cardiomyopathy_rare,
  epilepsy_variant_score = model_epilepsy_rare,
  noncoding_variant_score = model_noncoding_rare,
  common_variant_score = model_common,
  common_and_coding_score = model_common_and_coding,
  coding_rare_score = model_coding_rare,
  epilepsy_and_common_score = model_epilepsy_and_common,
  cardiac_and_common_score = model_cardiac_and_common
)
for (score_name in names(discovery_models)) {
  categorized_combined_data[[score_name]] <- predict(discovery_models[[score_name]], type = "link")
}


# Rank a score column (ties broken by first occurrence) and cut the
# ranks into five equal-sized bins (quintile 1 = lowest scores).
add_quintiles_discovery <- function(df, score_col, rank_col, quintile_col) {
  df[[rank_col]] <- rank(df[[score_col]], ties.method = "first")
  df[[quintile_col]] <- ceiling(df[[rank_col]] / (nrow(df) / 5))
  df
}

categorized_combined_data <- categorized_combined_data %>%
  add_quintiles_discovery("scores", "rank", "score_quintiles") %>%
  add_quintiles_discovery("cardiac_variant_score", "rank_cardiac", "cardiac_score_quintiles") %>%
  add_quintiles_discovery("epilepsy_variant_score", "rank_epilepsy", "epilepsy_score_quintiles") %>%
  add_quintiles_discovery("noncoding_variant_score", "rank_noncoding", "noncoding_score_quintiles") %>%
  add_quintiles_discovery("common_variant_score", "rank_common", "common_score_quintiles") %>%
  add_quintiles_discovery("common_and_coding_score", "rank_common_and_coding", "common_and_coding_score_quintiles") %>%
  add_quintiles_discovery("coding_rare_score", "rank_model_coding_rare", "coding_rare_score_quintiles") %>%
  add_quintiles_discovery("epilepsy_and_common_score", "rank_epilepsy_and_common", "epilepsy_and_common_score_quintiles") %>%
  add_quintiles_discovery("cardiac_and_common_score", "rank_cardiac_and_common", "cardiac_and_common_score_quintiles")

# Convert every link-scale score into a predicted probability via the
# inverse logit (plogis); output column name -> source score column.
discovery_probability_sources <- c(
  probability = "scores",
  cardiac_probability = "cardiac_variant_score",
  epilepsy_probability = "epilepsy_variant_score",
  noncoding_probability = "noncoding_variant_score",
  common_probability = "common_variant_score",
  common_and_coding_probability = "common_and_coding_score",
  coding_rare_probability = "coding_rare_score",
  epilepsy_and_common_probability = "epilepsy_and_common_score",
  cardiac_and_common_probability = "cardiac_and_common_score"
)
for (prob_col in names(discovery_probability_sources)) {
  src_col <- discovery_probability_sources[[prob_col]]
  categorized_combined_data[[prob_col]] <- plogis(categorized_combined_data[[src_col]])
}


# Per-quintile odds ratios (relative to the bottom quintile) with 95% CIs.
#
# @param data             Data frame with one row per subject.
# @param category_column  Unquoted factor column with levels "Control"/"zCase".
# @param quintile_column  Unquoted integer quintile column (1-5).
# @return A tibble with per-quintile counts, raw odds, odds ratios vs the
#   first quintile, the Woolf standard error of log(OR), and Wald 95% CIs.
calculate_odds_ratios <- function(data, category_column, quintile_column) {
  # Count controls and cases within each quintile and form the raw odds.
  odds_data <- data %>%
    dplyr::group_by({{ quintile_column }}) %>%
    dplyr::summarise(
      count_category_1 = sum({{ category_column }} == "Control", na.rm = TRUE),
      count_category_2_6 = sum({{ category_column }} == "zCase", na.rm = TRUE),
      .groups = 'drop'
    ) %>%
    dplyr::mutate(
      odds = count_category_2_6 / count_category_1
    )

  # group_by sorts by quintile, so row 1 is the lowest (reference) quintile.
  first_quintile_odds <- odds_data$odds[1]
  ref_controls <- odds_data$count_category_1[1]
  ref_cases <- odds_data$count_category_2_6[1]

  # OR relative to the reference quintile. The Woolf standard error of
  # log(OR) uses all four cells of the 2x2 table; the previous version
  # omitted the reference-quintile terms, which understated the CI width.
  odds_data <- odds_data %>%
    dplyr::mutate(
      odds_ratio = odds / first_quintile_odds,
      se_log_odds_ratio = sqrt(
        1 / count_category_1 + 1 / count_category_2_6 +
          1 / ref_controls + 1 / ref_cases
      ),
      lower_ci = exp(log(odds_ratio) - 1.96 * se_log_odds_ratio),
      upper_ci = exp(log(odds_ratio) + 1.96 * se_log_odds_ratio)
    )

  return(odds_data)
}

# Per-quintile odds-ratio summaries for each score (discovery cohort).
# Idiom fix: use `<-` for assignment instead of `=`.
combined_odds_summary <- calculate_odds_ratios(categorized_combined_data, Category, score_quintiles)
cardiac_odds_summary <- calculate_odds_ratios(categorized_combined_data, Category, cardiac_score_quintiles)
epilepsy_summary <- calculate_odds_ratios(categorized_combined_data, Category, epilepsy_score_quintiles)
common_summary <- calculate_odds_ratios(categorized_combined_data, Category, common_score_quintiles)
noncoding_summary <- calculate_odds_ratios(categorized_combined_data, Category, noncoding_score_quintiles)
common_and_coding_summary <- calculate_odds_ratios(categorized_combined_data, Category, common_and_coding_score_quintiles)
coding_rare_summary <- calculate_odds_ratios(categorized_combined_data, Category, coding_rare_score_quintiles)
# NOTE(review): "commmon" (three m's) is kept as-is because the name is
# referenced elsewhere; renaming would break those references.
commmon_and_epilepsy_summary <- calculate_odds_ratios(categorized_combined_data, Category, epilepsy_and_common_score_quintiles)
cardiac_and_common_summary <- calculate_odds_ratios(categorized_combined_data, Category, cardiac_and_common_score_quintiles)


################################################################################################################ replication

# Score the replication cohort under every sub-model (out-of-sample,
# link scale); one new column per sub-model, in this order.
replication_models <- list(
  cardiac_variant_score_replication = model_cardiomyopathy_rare,
  epilepsy_variant_score_replication = model_epilepsy_rare,
  noncoding_variant_score_replication = model_noncoding_rare,
  common_variant_score_replication = model_common,
  common_and_coding_score_replication = model_common_and_coding,
  coding_rare_score_replication = model_coding_rare,
  epilepsy_and_common_score_replication = model_epilepsy_and_common,
  cardiac_and_common_score_replication = model_cardiac_and_common
)
for (score_name in names(replication_models)) {
  categorized_replication_data[[score_name]] <- predict(
    replication_models[[score_name]],
    newdata = categorized_replication_data, type = "link"
  )
}


# Rank each replication score (ties broken by first occurrence) and cut
# the ranks into five equal-sized bins (quintile 1 = lowest scores).
add_quintiles_replication <- function(df, score_col, rank_col, quintile_col) {
  df[[rank_col]] <- rank(df[[score_col]], ties.method = "first")
  df[[quintile_col]] <- ceiling(df[[rank_col]] / (nrow(df) / 5))
  df
}

categorized_replication_data <- categorized_replication_data %>%
  add_quintiles_replication("scores_replication", "rank", "score_quintiles") %>%
  add_quintiles_replication("cardiac_variant_score_replication", "rank_cardiac", "cardiac_score_quintiles") %>%
  add_quintiles_replication("epilepsy_variant_score_replication", "rank_epilepsy", "epilepsy_score_quintiles") %>%
  add_quintiles_replication("noncoding_variant_score_replication", "rank_noncoding", "noncoding_score_quintiles") %>%
  add_quintiles_replication("common_variant_score_replication", "rank_common", "common_score_quintiles") %>%
  add_quintiles_replication("common_and_coding_score_replication", "rank_common_and_coding", "common_and_coding_score_quintiles") %>%
  add_quintiles_replication("coding_rare_score_replication", "rank_model_coding_rare", "coding_rare_score_quintiles") %>%
  add_quintiles_replication("epilepsy_and_common_score_replication", "rank_epilepsy_and_common", "epilepsy_and_common_score_quintiles") %>%
  add_quintiles_replication("cardiac_and_common_score_replication", "rank_cardiac_and_common", "cardiac_and_common_score_quintiles")

# Inverse-logit each replication link-scale score into a probability;
# output column name -> source score column.
replication_probability_sources <- c(
  probability = "scores_replication",
  cardiac_probability = "cardiac_variant_score_replication",
  epilepsy_probability = "epilepsy_variant_score_replication",
  noncoding_probability = "noncoding_variant_score_replication",
  common_probability = "common_variant_score_replication",
  common_and_coding_probability = "common_and_coding_score_replication",
  coding_rare_probability = "coding_rare_score_replication",
  epilepsy_and_common_probability = "epilepsy_and_common_score_replication",
  cardiac_and_common_probability = "cardiac_and_common_score_replication"
)
for (prob_col in names(replication_probability_sources)) {
  src_col <- replication_probability_sources[[prob_col]]
  categorized_replication_data[[prob_col]] <- plogis(categorized_replication_data[[src_col]])
}


# Per-quintile odds-ratio summaries for each score (replication cohort).
# Idiom fix: use `<-` for assignment instead of `=`.
combined_odds_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, score_quintiles)
cardiac_odds_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, cardiac_score_quintiles)
epilepsy_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, epilepsy_score_quintiles)
common_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, common_score_quintiles)
noncoding_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, noncoding_score_quintiles)
common_and_coding_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, common_and_coding_score_quintiles)
coding_rare_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, coding_rare_score_quintiles)
# NOTE(review): "commmon" (three m's) kept to match the discovery-cohort name.
commmon_and_epilepsy_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, epilepsy_and_common_score_quintiles)
cardiac_and_common_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, cardiac_and_common_score_quintiles)

Plot the discovery-cohort odds ratios by score quintile

# Overlay one score's per-quintile log2 odds ratios: connecting line,
# filled points, and 95% CI whiskers, all in the series color.
add_or_series <- function(summary_df, series_col) {
  quintiles <- seq_along(summary_df$odds_ratio)
  lines(quintiles, log2(summary_df$odds_ratio), col = series_col)
  points(quintiles, log2(summary_df$odds_ratio), pch = 19, col = series_col)
  arrows(x0 = quintiles,
         y0 = log2(summary_df$lower_ci),
         y1 = log2(summary_df$upper_ci),
         code = 3, angle = 90, length = 0.05, col = series_col)
}

# Shared y-range so every series' CI fits inside the frame.
all_lower <- c(combined_odds_summary$lower_ci, cardiac_odds_summary$lower_ci,
               epilepsy_summary$lower_ci, common_summary$lower_ci,
               noncoding_summary$lower_ci, common_and_coding_summary$lower_ci,
               coding_rare_summary$lower_ci)
all_upper <- c(combined_odds_summary$upper_ci, cardiac_odds_summary$upper_ci,
               epilepsy_summary$upper_ci, common_summary$upper_ci,
               noncoding_summary$upper_ci, common_and_coding_summary$upper_ci,
               coding_rare_summary$upper_ci)

# Base frame plus the Combined-model points (the original had a comment
# embedded inside this call's argument list, which split the call in two).
plot(log2(combined_odds_summary$odds_ratio),
     ylim = c(log2(min(all_lower)), log2(max(all_upper))),
     pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
     xaxt = "n", col = "#3C8C78")

# Combined: line and 95% CI bars (points already drawn by plot()).
lines(seq_along(combined_odds_summary$odds_ratio),
      log2(combined_odds_summary$odds_ratio), col = "#3C8C78")
arrows(x0 = seq_along(combined_odds_summary$odds_ratio),
       y0 = log2(combined_odds_summary$lower_ci),
       y1 = log2(combined_odds_summary$upper_ci),
       code = 3, angle = 90, length = 0.05, col = "#3C8C78")

# Remaining series, one color each.
add_or_series(cardiac_odds_summary, "#2E86C1")     # Cardiac
add_or_series(epilepsy_summary, "#A93226")         # Epilepsy
add_or_series(common_summary, "#229954")           # Common
add_or_series(noncoding_summary, "#D68910")        # Noncoding
add_or_series(common_and_coding_summary, "Black")  # Common + coding
add_or_series(coding_rare_summary, "Pink")         # Coding rare

# Add x-axis labels for quintiles
axis(1, at = seq_along(combined_odds_summary$odds_ratio), labels = TRUE)

# legend
legend("topleft",
       legend = c("Combined", "Cardiac", "Epilepsy", "Common", "Noncoding",
                  "common_and_coding", "model_coding_rare"),
       col = c("#3C8C78", "#2E86C1", "#A93226", "#229954", "#D68910", "Black", "Pink"),
       pch = 19, cex = 0.8)

Plot the replication-cohort odds ratios by score quintile

# Overlay one replication series: connecting line, filled points, and
# 95% CI whiskers in the series color.
add_or_series_replication <- function(summary_df, series_col) {
  quintiles <- seq_along(summary_df$odds_ratio)
  lines(quintiles, log2(summary_df$odds_ratio), col = series_col)
  points(quintiles, log2(summary_df$odds_ratio), pch = 19, col = series_col)
  arrows(x0 = quintiles,
         y0 = log2(summary_df$lower_ci),
         y1 = log2(summary_df$upper_ci),
         code = 3, angle = 90, length = 0.05, col = series_col)
}

# Base frame plus the Combined-model points. The y-range is fixed at
# (-1, 10) rather than computed from the data (kept from the original;
# the original also had a comment embedded inside this call, splitting it).
plot(log2(combined_odds_summary_replication$odds_ratio),
     ylim = c(-1, 10),
     pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
     xaxt = "n", col = "#3C8C78")

# Combined: line and 95% CI bars (points already drawn by plot()).
lines(seq_along(combined_odds_summary_replication$odds_ratio),
      log2(combined_odds_summary_replication$odds_ratio), col = "#3C8C78")
arrows(x0 = seq_along(combined_odds_summary_replication$odds_ratio),
       y0 = log2(combined_odds_summary_replication$lower_ci),
       y1 = log2(combined_odds_summary_replication$upper_ci),
       code = 3, angle = 90, length = 0.05, col = "#3C8C78")

# Remaining series, one color each.
add_or_series_replication(cardiac_odds_summary_replication, "#2E86C1")     # Cardiac
add_or_series_replication(epilepsy_summary_replication, "#A93226")         # Epilepsy
add_or_series_replication(common_summary_replication, "#229954")           # Common
add_or_series_replication(noncoding_summary_replication, "#D68910")        # Noncoding
add_or_series_replication(common_and_coding_summary_replication, "Black")  # Common + coding
add_or_series_replication(coding_rare_summary_replication, "Pink")         # Coding rare

# Add x-axis labels for quintiles
axis(1, at = seq_along(combined_odds_summary_replication$odds_ratio), labels = TRUE)

# legend
legend("topleft",
       legend = c("Combined", "Cardiac", "Epilepsy", "Common", "Noncoding",
                  "common_and_coding", "model_coding_rare"),
       col = c("#3C8C78", "#2E86C1", "#A93226", "#229954", "#D68910", "Black", "Pink"),
       pch = 19, cex = 0.8)

Z test for odds significance

# Two-sided z-test comparing per-quintile log odds ratios between two
# odds summaries (as produced by calculate_odds_ratios).
#
# @param odds1,odds2 Data frames with columns `odds_ratio` and
#   `se_log_odds_ratio`, aligned row-by-row (one row per quintile).
# @return A data.frame with one row per quintile: `quintile` index and
#   the two-sided normal-approximation p-value for the difference.
compare_odds <- function(odds1, odds2) {
  # Difference of the two log odds ratios, quintile by quintile.
  delta_log_odds <- log(odds1$odds_ratio) - log(odds2$odds_ratio)

  # SE of the difference, treating the two estimates as independent.
  # NOTE(review): the two summaries are computed on the same subjects,
  # so independence is an approximation -- confirm this is acceptable.
  se_delta_log_odds <- sqrt(odds1$se_log_odds_ratio^2 + odds2$se_log_odds_ratio^2)

  # Two-sided p-value per quintile from the standard normal.
  z_values <- delta_log_odds / se_delta_log_odds
  p_values <- 2 * pnorm(abs(z_values), lower.tail = FALSE)

  data.frame(quintile = seq_along(p_values), p_values = p_values)
}

# Pairwise z-tests between the coding-only, common+coding, and full
# models, each result tagged with its comparison label on creation.
p_values_coding_vs_common_coding <- transform(
  compare_odds(coding_rare_summary, common_and_coding_summary),
  comparison = "Coding vs Common and Coding"
)
p_values_coding_vs_model <- transform(
  compare_odds(coding_rare_summary, combined_odds_summary),
  comparison = "Coding vs Model"
)
p_values_common_coding_vs_model <- transform(
  compare_odds(common_and_coding_summary, combined_odds_summary),
  comparison = "Common and Coding vs Model"
)

# Stack the three comparisons into one long table.
combined_p_values <- bind_rows(
  p_values_coding_vs_common_coding,
  p_values_coding_vs_model,
  p_values_common_coding_vs_model
)

# Quintile as a factor gives cleaner axis labels downstream.
combined_p_values$quintile <- factor(combined_p_values$quintile, levels = 1:5)

combined_p_values
   quintile     p_values                  comparison
1         1 1.0000000000 Coding vs Common and Coding
2         2 0.2755794672 Coding vs Common and Coding
3         3 0.1193593972 Coding vs Common and Coding
4         4 0.0024478936 Coding vs Common and Coding
5         5 0.0375063182 Coding vs Common and Coding
6         1 1.0000000000             Coding vs Model
7         2 0.0802036186             Coding vs Model
8         3 0.0143006222             Coding vs Model
9         4 0.0005882193             Coding vs Model
10        5 0.0001029296             Coding vs Model
11        1 1.0000000000  Common and Coding vs Model
12        2 0.5116308088  Common and Coding vs Model
13        3 0.3728300451  Common and Coding vs Model
14        4 0.6892061513  Common and Coding vs Model
15        5 0.0598086456  Common and Coding vs Model

Calculate the scores per category and plot them

# Helper: notched boxplot of one score column, split and filled by Category.
# `score_col` is the unquoted column name; `lo`/`hi` reproduce the original
# per-plot y-axis limits. Note ylim() drops observations outside the limits
# BEFORE the boxplot statistics are computed (hence the suppressed warnings
# when printing).
make_score_boxplot <- function(data, score_col, lo, hi) {
  ggplot(data, aes(x = Category, y = {{ score_col }}, fill = Category)) +
    geom_boxplot(outlier.shape = NA, notch = TRUE) +
    scale_fill_manual(values = group_colors) +
    ylim(lo, hi) +
    theme_cowplot(12)
}

common_variant_score_plot <- make_score_boxplot(categorized_combined_data, common_variant_score, -2, 2)
cardiac_variant_score_plot <- make_score_boxplot(categorized_combined_data, cardiac_variant_score, -1, 0.6)
epilepsy_variant_score_plot <- make_score_boxplot(categorized_combined_data, epilepsy_variant_score, -1, 0.6)
noncoding_variant_score_plot <- make_score_boxplot(categorized_combined_data, noncoding_variant_score, -1, 0.75)
scores_plot <- make_score_boxplot(categorized_combined_data, scores, -3, 3)

# "rows removed" warnings from ylim() clipping are expected; suppress them
suppressWarnings(print(common_variant_score_plot))

suppressWarnings(print(cardiac_variant_score_plot))

suppressWarnings(print(epilepsy_variant_score_plot))

suppressWarnings(print(noncoding_variant_score_plot))

suppressWarnings(print(scores_plot))

replication

Now plot the Z-normalized replication data

# Per-score mean and SD in the replication cohort, used to z-normalise the
# replication panels onto the same scale as the discovery panels.
mean_common_replication <- mean(categorized_replication_data$common_variant_score_replication)
sd_common_replication <- sd(categorized_replication_data$common_variant_score_replication)
mean_cardiac_replication <- mean(categorized_replication_data$cardiac_variant_score_replication)
sd_cardiac_replication <- sd(categorized_replication_data$cardiac_variant_score_replication)
mean_epilepsy_replication <- mean(categorized_replication_data$epilepsy_variant_score_replication)
sd_epilepsy_replication <- sd(categorized_replication_data$epilepsy_variant_score_replication)
mean_noncoding_replication <- mean(categorized_replication_data$noncoding_variant_score_replication)
sd_noncoding_replication <- sd(categorized_replication_data$noncoding_variant_score_replication)
mean_scores_replication <- mean(categorized_replication_data$scores_replication)
sd_scores_replication <- sd(categorized_replication_data$scores_replication)

# Helper: notched, Category-filled boxplot of an already z-scored vector,
# with the common y-range used by all replication panels
plot_replication_z <- function(z_values) {
  ggplot(categorized_replication_data,
         aes(x = Category, y = z_values, fill = Category)) +
    geom_boxplot(outlier.shape = NA, notch = TRUE) +
    scale_fill_manual(values = group_colors) +
    ylim(-2.5, 2.5) +
    theme_cowplot(12)
}

common_variant_score_replication_plot <- plot_replication_z(
  (categorized_replication_data$common_variant_score_replication - mean_common_replication) / sd_common_replication)

cardiac_variant_score_replication_plot <- plot_replication_z(
  (categorized_replication_data$cardiac_variant_score_replication - mean_cardiac_replication) / sd_cardiac_replication)

epilepsy_variant_score_replication_plot <- plot_replication_z(
  (categorized_replication_data$epilepsy_variant_score_replication - mean_epilepsy_replication) / sd_epilepsy_replication)

noncoding_variant_score_replication_plot <- plot_replication_z(
  (categorized_replication_data$noncoding_variant_score_replication - mean_noncoding_replication) / sd_noncoding_replication)

scores_replication_plot <- plot_replication_z(
  (categorized_replication_data$scores_replication - mean_scores_replication) / sd_scores_replication)

suppressWarnings(print(common_variant_score_replication_plot))

suppressWarnings(print(cardiac_variant_score_replication_plot))
Notch went outside hinges
ℹ Do you want `notch = FALSE`?

# Remaining replication panels (warnings from ylim() clipping suppressed)
suppressWarnings(print(epilepsy_variant_score_replication_plot))

suppressWarnings(print(noncoding_variant_score_replication_plot))

suppressWarnings(print(scores_replication_plot))

Compute the Wilcoxon rank-sum tests

# Two-sided Wilcoxon rank-sum tests: each score distribution compared
# between the two Category levels in the discovery cohort.
wilcox.test_cardiomyopathy <- wilcox.test(cardiac_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_cardiomyopathy

    Wilcoxon rank sum test with continuity correction

data:  cardiac_variant_score by Category
W = 91701, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
wilcox.test_epilepsy <- wilcox.test(epilepsy_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_epilepsy

    Wilcoxon rank sum test with continuity correction

data:  epilepsy_variant_score by Category
W = 104718, p-value = 1.487e-10
alternative hypothesis: true location shift is not equal to 0
wilcox.test_common <- wilcox.test(common_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_common

    Wilcoxon rank sum test with continuity correction

data:  common_variant_score by Category
W = 93124, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
wilcox.test_noncoding <- wilcox.test(noncoding_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_noncoding

    Wilcoxon rank sum test with continuity correction

data:  noncoding_variant_score by Category
W = 115548, p-value = 2.943e-05
alternative hypothesis: true location shift is not equal to 0
wilcox.test_combined <- wilcox.test(scores ~ Category, data = categorized_combined_data)
wilcox.test_combined

    Wilcoxon rank sum test with continuity correction

data:  scores by Category
W = 71316, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0

replication

# The same Wilcoxon rank-sum tests repeated in the replication cohort
wilcox.test_cardiomyopathy_replication <- wilcox.test(cardiac_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_cardiomyopathy_replication

    Wilcoxon rank sum test with continuity correction

data:  cardiac_variant_score_replication by Category
W = 430, p-value = 3.978e-14
alternative hypothesis: true location shift is not equal to 0
wilcox.test_epilepsy_replication <- wilcox.test(epilepsy_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_epilepsy_replication

    Wilcoxon rank sum test with continuity correction

data:  epilepsy_variant_score_replication by Category
W = 734.5, p-value = 1.268e-09
alternative hypothesis: true location shift is not equal to 0
wilcox.test_common_replication <- wilcox.test(common_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_common_replication

    Wilcoxon rank sum test with continuity correction

data:  common_variant_score_replication by Category
W = 1804, p-value = 0.4004
alternative hypothesis: true location shift is not equal to 0
wilcox.test_noncoding_replication <- wilcox.test(noncoding_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_noncoding_replication

    Wilcoxon rank sum test with continuity correction

data:  noncoding_variant_score_replication by Category
W = 497, p-value = 4.796e-13
alternative hypothesis: true location shift is not equal to 0
wilcox.test_combined_replication <- wilcox.test(scores_replication ~ Category, data = categorized_replication_data)
wilcox.test_combined_replication

    Wilcoxon rank sum test with continuity correction

data:  scores_replication by Category
W = 442, p-value = 6.391e-14
alternative hypothesis: true location shift is not equal to 0

Plot the distribution of the categories by score

# Density of the combined score for each Category.
# geom_density() maps y to the computed density by default, so the
# deprecated `..density..` dot-dot notation the original used was
# redundant and has been dropped; the rendered plot is unchanged.
distribution_score_plot <- ggplot(categorized_combined_data, aes(x = scores, fill = Category)) + 
  geom_density(alpha = 0.5, adjust = 1) +
  scale_fill_manual(values = group_colors) +
  scale_x_continuous(limits = c(-3, 4)) +
  labs(title = "Normalized Density Plots of Scores by Category",
       x = "Score",
       y = "Density") +
  theme_cowplot(12) 


# scale_x_continuous(limits=) drops observations outside [-3, 4];
# suppress the resulting "rows removed" warnings
suppressWarnings(print(distribution_score_plot))

Plot the filled density of the categories by score

# Conditional (stacked-to-100%) density: at each score value, the fraction
# of the local density contributed by each Category.
density_score_plot <- categorized_combined_data %>%
  ggplot(aes(x = scores, fill = Category)) +
  geom_density(position = "fill", alpha = 0.5) +
  scale_x_continuous(limits = c(-3, 4)) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = group_colors) +
  labs(title = "Stacked Density of Scores by Category",
       x = "Score",
       y = "Fraction") +
  theme_cowplot()

# Print the plot (suppressing out-of-range removal warnings)
suppressWarnings(print(density_score_plot))

Plot the ROC and Precision-recall

# Convert 'Category' into a binary outcome: 0 for Control, 1 for Case
# (arrest-ontology groups 2-6 per the group definitions above).
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)

# Full model: ROC of the combined-model probability against case status.
# roc() emits the level/direction messages captured below.
roc_result_full <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$probability, plot = TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases

paste("Full model AUC:")
[1] "Full model AUC:"
auc(roc_result_full)
Area under the curve: 0.7375
# Common-variant model only
roc_result_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$common_probability, plot = TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases

paste("commonvariant model AUC:")
[1] "commonvariant model AUC:"
auc(roc_result_common)
Area under the curve: 0.6573
# Rare coding-variant model only
roc_result_coding <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$coding_rare_probability, plot = TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases

paste("coding variant model AUC:")
[1] "coding variant model AUC:"
auc(roc_result_coding)
Area under the curve: 0.6709
# Common plus rare coding variants
roc_result_common_and_coding <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$common_and_coding_probability, plot = TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases

paste("common and coding variant model AUC:")
[1] "common and coding variant model AUC:"
auc(roc_result_common_and_coding)
Area under the curve: 0.7222

Plot the ROC and Precision-recall (replication cohort)

# Convert 'Category' into a binary outcome for the replication cohort:
# 0 for Control, 1 for Case (arrest-ontology groups 2-6).
categorized_replication_data$binary_outcome <- ifelse(categorized_replication_data$Category == "Control", 0, 1)

# ROC of the full-model probability in the replication cohort
roc_result_full_replication <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$probability, plot = TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases

auc(roc_result_full_replication)
Area under the curve: 0.8882
# Compute the remaining discovery-cohort ROC curves (not plotted)
roc_result_cardiac <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_probability, plot = FALSE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
roc_result_epilepsy <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$epilepsy_probability, plot = FALSE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
roc_result_epilepsy_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$epilepsy_and_common_probability, plot = FALSE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
roc_result_cardiac_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_and_common_probability, plot = FALSE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Extract the AUC from each fitted ROC object
auc_epilepsy <- auc(roc_result_epilepsy)
auc_cardiac <- auc(roc_result_cardiac)
auc_common <- auc(roc_result_common)
auc_epilepsy_and_common <- auc(roc_result_epilepsy_and_common)
auc_cardiac_and_common <- auc(roc_result_cardiac_and_common)
auc_common_and_coding <- auc(roc_result_common_and_coding)
auc_coding <- auc(roc_result_coding)
auc_full <- auc(roc_result_full)

# Base AUC for random chance
auc_random_chance <- 0.5

# Percentage improvement of an AUC over the 0.5 random-chance baseline
pct_over_chance <- function(model_auc) {
  (model_auc - auc_random_chance) / auc_random_chance * 100
}

# One row per model: percentage change over random chance
percentage_changes_df <- data.frame(
  Model = c("Epilepsy", "Cardiac", "Common", "Epilepsy and Common",
            "Cardiac and Common", "Common and Coding", "Coding", "Full"),
  PercentageChange = c(
    pct_over_chance(auc_epilepsy),
    pct_over_chance(auc_cardiac),
    pct_over_chance(auc_common),
    pct_over_chance(auc_epilepsy_and_common),
    pct_over_chance(auc_cardiac_and_common),
    pct_over_chance(auc_common_and_coding),
    pct_over_chance(auc_coding),
    pct_over_chance(auc_full)
  )
)


# Raw AUC per model, same model order as percentage_changes_df
auc_df <- data.frame(
  Model = c("Epilepsy", "Cardiac", "Common", "Epilepsy and Common",
            "Cardiac and Common", "Common and Coding", "Coding", "Full"),
  auc = c(auc_epilepsy, auc_cardiac, auc_common, auc_epilepsy_and_common,
          auc_cardiac_and_common, auc_common_and_coding, auc_coding,
          auc_full)
)

# Fix the display order of the models on the plot's x axis
percentage_changes_df$Model <- factor(
  percentage_changes_df$Model,
  levels = c("Epilepsy", "Cardiac", "Coding", "Common", "Epilepsy and Common",
             "Cardiac and Common", "Common and Coding", "Full")
)

# Bar chart: percentage improvement over chance, one bar per model.
# (theme_minimal() is overridden by the later theme_cowplot(12); both are
# kept to match the original plot exactly.)
auc_performance_plot <- percentage_changes_df %>%
  ggplot(aes(x = Model, y = PercentageChange, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme_minimal() +
  labs(title = "Percentage Change Over Random Chance AUC",
       y = "Percentage Change (%)",
       x = "") +
  theme_cowplot(12)

print(auc_performance_plot)

# Calculate p-values using DeLong's test.
# roc.test() defaults to DeLong's test for two correlated ROC curves
# computed on the same subjects.
p_values <- list(
  "Epilepsy vs CMAR" = roc.test(roc_result_epilepsy, roc_result_cardiac)$p.value,
  "CMAR vs Common" = roc.test(roc_result_common, roc_result_cardiac)$p.value,
  "Epilepsy vs Common" = roc.test(roc_result_common, roc_result_epilepsy)$p.value,
  "Coding vs CMAR" = roc.test(roc_result_coding, roc_result_cardiac)$p.value,
  "Coding vs Epilepsy" = roc.test(roc_result_coding, roc_result_epilepsy)$p.value,
  "CMAR and Common vs Common" = roc.test(roc_result_cardiac_and_common, roc_result_common)$p.value,
  "Common vs Coding" = roc.test(roc_result_common, roc_result_coding)$p.value,
  "CMAR and common vs Common and Coding" = roc.test(roc_result_cardiac_and_common, roc_result_common_and_coding)$p.value,
  "Common vs Common and Coding" = roc.test(roc_result_common, roc_result_common_and_coding)$p.value,
  "Full vs Common and Coding" = roc.test(roc_result_common_and_coding, roc_result_full)$p.value
)

# Print p-values
print(p_values)
$`Epilepsy vs CMAR`
[1] 0.01751543

$`CMAR vs Common`
[1] 0.8050644

$`Epilepsy vs Common`
[1] 0.03735391

$`Coding vs CMAR`
[1] 0.3745788

$`Coding vs Epilepsy`
[1] 6.97466e-05

$`CMAR and Common vs Common`
[1] 2.191874e-06

$`Common vs Coding`
[1] 0.5104602

$`CMAR and common vs Common and Coding`
[1] 0.08744431

$`Common vs Common and Coding`
[1] 2.504464e-07

$`Full vs Common and Coding`
[1] 0.0170021

Let's see how common and rare cardiac variants interact

# Rank-rank dot plot: each subject positioned by the rank of their cardiac
# (rare-variant) score vs the rank of their common-variant score, coloured
# by case/control status.
# An earlier draft of this plot mapped colours to values "1"/"2-3"/"4-6"
# that do not match the Category levels and was overwritten before ever
# being printed; that dead first construction has been removed.
dot_plot <- ggplot(categorized_combined_data, aes(x = rank_cardiac, y = rank_common, color = Category)) +
  geom_point(alpha = 1) +
  scale_color_manual(values = group_colors) +
  labs(x = "Cardiac Variant Score Rank", y = "Common Variant Score Rank", color = "Arrest Status") +
  theme_cowplot(12) +
  theme(strip.text.x = element_text(size = 10))

dot_plot

# Median rank on each axis defines the quadrant boundaries
median_cardiac <- median(categorized_combined_data$rank_cardiac, na.rm = TRUE)
median_common <- median(categorized_combined_data$rank_common, na.rm = TRUE)

# Split subjects into four rank-rank quadrants.
# NOTE(review): the boundary conditions are not a clean partition —
# quadrant 1 uses <=/<= and quadrant 4 uses >=/>=, so a subject exactly on
# both medians falls into BOTH, while quadrants 2/3 use strict
# inequalities, so a subject exactly on one median (but not both) can be
# dropped entirely. Confirm this matches the intended definition.
# Filter for the top half
quadrant1 <- categorized_combined_data %>%
  filter(rank_cardiac <= median_cardiac & rank_common <= median_common)

quadrant2 <- categorized_combined_data %>%
  filter(rank_cardiac < median_cardiac & rank_common > median_common)

quadrant3 <- categorized_combined_data %>%
  filter(rank_cardiac > median_cardiac & rank_common < median_common)

quadrant4 <- categorized_combined_data %>%
  filter(rank_cardiac >= median_cardiac & rank_common >= median_common)


# Combine into one data frame, tagging each row with its quadrant
combined_quadrants <- bind_rows(
  quadrant1 %>% mutate(Quadrant = 'Quadrant 1'),
  quadrant2 %>% mutate(Quadrant = 'Quadrant 2'),
  quadrant3 %>% mutate(Quadrant = 'Quadrant 3'),
  quadrant4 %>% mutate(Quadrant = 'Quadrant 4')
)

# Calculate the count of individuals in each quadrant for each category
# and the total count of individuals in each category
percentage_by_category <- combined_quadrants %>%
  group_by(Category, Quadrant) %>%
  summarise(Count = n(), .groups = 'drop') %>%
  group_by(Category) %>%
  mutate(Total = sum(Count), Percentage = Count / Total * 100)

# View the result
percentage_by_category
# A tibble: 8 × 5
# Groups:   Category [2]
  Category Quadrant   Count Total Percentage
  <fct>    <chr>      <int> <int>      <dbl>
1 Control  Quadrant 1   210   537       39.1
2 Control  Quadrant 2   117   537       21.8
3 Control  Quadrant 3   112   537       20.9
4 Control  Quadrant 4    98   537       18.2
5 zCase    Quadrant 1    99   506       19.6
6 zCase    Quadrant 2    95   506       18.8
7 zCase    Quadrant 3   101   506       20.0
8 zCase    Quadrant 4   211   506       41.7
# Observed vs expected quadrant membership, per category.

# Overall proportion of all subjects falling in each quadrant
overall_counts <- combined_quadrants %>%
  count(Quadrant) %>%
  mutate(OverallProportion = n / sum(n))

# Total subjects per category, used to scale the expected counts
category_totals <- combined_quadrants %>%
  count(Category) %>%
  dplyr::rename(TotalInCategory = n)

# Expected count per Category x Quadrant cell under independence
expected_counts <- combined_quadrants %>%
  dplyr::select(Category, Quadrant) %>%
  distinct() %>%
  left_join(overall_counts, by = "Quadrant") %>%
  left_join(category_totals, by = "Category") %>%
  mutate(Expected = OverallProportion * TotalInCategory)

# Observed count per Category x Quadrant cell
observed_counts <- combined_quadrants %>%
  count(Category, Quadrant) %>%
  dplyr::rename(Observed = n)

# Observed and expected side by side, with their ratio, for plotting.
# Reuses observed_counts instead of re-counting, and computes Ratio once
# (the original recomputed the identical column in a second mutate()).
combined_for_plotting <- observed_counts %>%
  left_join(overall_counts, by = "Quadrant") %>%
  left_join(category_totals, by = "Category") %>%
  mutate(Expected = OverallProportion * TotalInCategory, Ratio = Observed / Expected)

# Plot the Observed/Expected ratio per quadrant.
# Ratio is computed as Observed / Expected, so the axis label and title
# now read "Observed/Expected" (the original labels had the fraction
# inverted).
quadrant_plot <- ggplot(combined_for_plotting, aes(x = Quadrant, y = Ratio, fill = Category)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(y = "Observed/Expected Ratio", x = "Quadrant", title = "Observed/Expected Ratios by Category and Quadrant") +
  theme_cowplot(12) +
  scale_fill_manual(values = group_colors) +
  # dashed reference line at 1 = observed equals expected
  geom_hline(yintercept = 1, linetype = "dashed", color = "Black")

quadrant_plot

# Create the contingency table of observed counts (Category x Quadrant)
contingency_table <- xtabs(Observed ~ Category + Quadrant, data = observed_counts)

# Print the contingency table
print(contingency_table)
         Quadrant
Category  Quadrant 1 Quadrant 2 Quadrant 3 Quadrant 4
  Control        210        117        112         98
  zCase           99         95        101        211
# Chi-squared test of independence between category and quadrant
chi_squared_result <- chisq.test(contingency_table)

# Print the results of the chi-squared test
print(chi_squared_result)

    Pearson's Chi-squared test

data:  contingency_table
X-squared = 83.201, df = 3, p-value < 2.2e-16
# Filter the data for cardiac_score_quintiles > 3.
# NOTE(review): the object name says "top_3_quintiles" but `> 3` keeps
# quintiles 4-5 only (the top TWO); `>= 3` would keep the top three —
# confirm which threshold is intended.
top_3_quintiles <- categorized_combined_data %>%
  filter(cardiac_score_quintiles > 3)


# Scatter of cardiac vs common score with a per-Category linear fit
common_density_score_plot <- ggplot(top_3_quintiles, aes(x = cardiac_variant_score, y = common_variant_score)) +
  geom_point(aes(color = Category), alpha = 0.5) +  
  geom_smooth(method = "lm", aes(group = Category, color = Category)) +  
  labs(x = "Cardiac Variant Score", y = "Common Variant Score", title = "Cardiac Variant Score vs. Common Variant Score by Category") +
  scale_color_manual(values = group_colors) +  
  theme_cowplot(12)  

# Highlight the specific subject (ID CGM0000563) in red
common_density_score_plot <- common_density_score_plot +
  geom_point(data = subset(top_3_quintiles, ID == "CGM0000563"), aes(x = cardiac_variant_score, y = common_variant_score), color = "red")

# Print 
print(common_density_score_plot)
`geom_smooth()` using formula = 'y ~ x'

#Do the stats
# Interaction model: does the common-score ~ cardiac-score slope differ
# between Category levels? The interaction term tests for a slope
# difference.
model_common_cardiac <- lm(common_variant_score ~ cardiac_variant_score * Category, data = top_3_quintiles)
summary(model_common_cardiac)

Call:
lm(formula = common_variant_score ~ cardiac_variant_score * Category, 
    data = top_3_quintiles)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.97751 -0.32432  0.02594  0.40306  1.52690 

Coefficients:
                                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)                         -0.14428    0.05354  -2.695  0.00733 ** 
cardiac_variant_score               -0.17503    0.08772  -1.995  0.04667 *  
CategoryzCase                        0.36154    0.06888   5.249 2.45e-07 ***
cardiac_variant_score:CategoryzCase  0.17451    0.08979   1.943  0.05264 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.6156 on 414 degrees of freedom
Multiple R-squared:  0.09912,   Adjusted R-squared:  0.0926 
F-statistic: 15.18 on 3 and 414 DF,  p-value: 2.158e-09
# Same scatter/fit as above, for cardiac vs epilepsy variant score
epilepsy_density_score_plot <- ggplot(top_3_quintiles, aes(x = cardiac_variant_score, y = epilepsy_variant_score)) +
  geom_point(aes(color = Category), alpha = 0.5) +  
  geom_smooth(method = "lm", aes(group = Category, color = Category)) +  
  labs(x = "Cardiac Variant Score", y = "Epilepsy Variant Score", title = "Cardiac Variant Score vs. Epilepsy Variant Score by Category") +
  scale_color_manual(values = group_colors) +  
  theme_cowplot(12)  

# Highlight the specific subject (ID CGM0000563) in red
epilepsy_density_score_plot <- epilepsy_density_score_plot +
  geom_point(data = subset(top_3_quintiles, ID == "CGM0000563"), aes(x = cardiac_variant_score, y = epilepsy_variant_score), color = "red")


# Print 
print(epilepsy_density_score_plot)
`geom_smooth()` using formula = 'y ~ x'

#Do the stats
# Interaction model: does the epilepsy-score ~ cardiac-score slope differ
# between cases and controls in the upper cardiac quintiles?
model_epilepsy_cardiac <- lm(epilepsy_variant_score ~ cardiac_variant_score * Category, data = top_3_quintiles)
summary(model_epilepsy_cardiac)

Call:
lm(formula = epilepsy_variant_score ~ cardiac_variant_score * 
    Category, data = top_3_quintiles)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.2366 -0.3230 -0.0335  0.2956 10.1092 

Coefficients:
                                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)                         -0.04534    0.07551  -0.600   0.5485    
cardiac_variant_score               -0.12827    0.12372  -1.037   0.3004    
CategoryzCase                       -0.21357    0.09714  -2.199   0.0285 *  
cardiac_variant_score:CategoryzCase  0.71880    0.12664   5.676  2.6e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.8682 on 414 degrees of freedom
Multiple R-squared:  0.5496,    Adjusted R-squared:  0.5463 
F-statistic: 168.4 on 3 and 414 DF,  p-value: < 2.2e-16
# do wilcox_test
# Wilcoxon rank-sum tests restricted to the upper cardiac-score quintiles.
# The Category filter appears redundant (the tables above show only the
# two levels "Control" and "zCase") — presumably a safeguard; confirm.
CMARR_epilepsy_wilcox_test_results <- wilcox.test(epilepsy_variant_score ~ Category,
                                  data = top_3_quintiles %>% filter(Category %in% c("zCase", "Control")))



CMARR_epilepsy_wilcox_test_results

    Wilcoxon rank sum test with continuity correction

data:  epilepsy_variant_score by Category
W = 13768, p-value = 2.933e-08
alternative hypothesis: true location shift is not equal to 0
CMARR_common_wilcox_test_results <- wilcox.test(common_variant_score ~ Category,
                                  data = top_3_quintiles %>% filter(Category %in% c("zCase", "Control")))


CMARR_common_wilcox_test_results

    Wilcoxon rank sum test with continuity correction

data:  common_variant_score by Category
W = 13423, p-value = 5.446e-09
alternative hypothesis: true location shift is not equal to 0
no_PLPs_categorized_data <- categorized_combined_data %>%
  filter(PLP_Cardiomyopathy < 1 & probability > 0.7)

# Count the number of rows where arrest_group equals 1 (Controls)
paste("number of PLP negative DISCOVERY controls over the threshold")
[1] "number of PLP negative DISCOVERY controls over the threshold"
nrow(no_PLPs_categorized_data %>% filter(arrest_group == 1))
[1] 10
# Count the number of rows where arrest_group equals > 1 (Cases)
paste("number of PLP negative DISCOVERY cases over the threshold")
[1] "number of PLP negative DISCOVERY cases over the threshold"
nrow(no_PLPs_categorized_data %>% filter(arrest_group > 1))
[1] 57
# REPLICATION cohort: same filter — no cardiomyopathy P/LP variant and
# model probability above 0.7
no_PLPs_categorized_replication_data <- categorized_replication_data %>%
  filter(PLP_Cardiomyopathy < 1 & probability > 0.7)

# Count the number of rows where arrest_group equals 1 (Controls)
paste("number of PLP negative REPLICATION controls over the threshold")
[1] "number of PLP negative REPLICATION controls over the threshold"
nrow(no_PLPs_categorized_replication_data %>% filter(arrest_group == 1))
[1] 0
# Count the number of rows where arrest_group equals > 1 (Cases)
paste("number of PLP negative REPLICATION cases over the threshold")
[1] "number of PLP negative REPLICATION cases over the threshold"
nrow(no_PLPs_categorized_replication_data %>% filter(arrest_group > 1))
[1] 27

NOW, retrain the model on the validation cohort and apply it to the discovery cohort

# Refit the logistic model on the REPLICATION cohort with the same predictor
# set as the discovery model. Fixed: the formula previously contained a stray
# duplicated "+" before Cardiomyopathy_noncoding_rare (parsed as a no-op unary
# plus, but a typo). NB: glm warns of non-convergence here — the replication
# data are perfectly separated (fitted probabilities numerically 0 or 1).
model_replication <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null, 
             data = categorized_replication_data, family = binomial())
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Echo the replication-cohort model (prints call, coefficients, deviance)
model_replication

Call:  glm(formula = Category ~ Normalized_Heart_rate + Normalized_PR_interval + 
    Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + 
    Normalized_LVESV + Normalized_SVI + Normalized_Afib + Epilepsy_rare + 
    Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + 
    PLP_Epilepsy + PLP_Cardiomyopathy + +Cardiomyopathy_noncoding_rare + 
    Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null, 
    family = binomial(), data = categorized_replication_data)

Coefficients:
                  (Intercept)          Normalized_Heart_rate  
                     -32.4336                        -1.9711  
       Normalized_PR_interval                 Normalized_QTc  
                      -3.5941                       -13.7152  
                Normalized_BP                 Normalized_QRS  
                      -2.1713                         9.7180  
              Normalized_LVEF               Normalized_LVESV  
                      -5.8998                       -23.3856  
               Normalized_SVI                Normalized_Afib  
                       2.4789                         8.0705  
                Epilepsy_rare             Epilepsy_ultrarare  
                      -6.1431                       -64.2490  
          Cardiomyopathy_rare       Cardiomyopathy_ultrarare  
                      -6.8631                        -4.2113  
                 PLP_Epilepsy             PLP_Cardiomyopathy  
                     -32.7781                         9.8564  
Cardiomyopathy_noncoding_rare        Epilepsy_noncoding_rare  
                       0.6944                         3.6171  
          Cardiomyopathy_null                  Epilepsy_null  
                      -8.5436                       -32.4137  

Degrees of Freedom: 125 Total (i.e. Null);  106 Residual
Null Deviance:      174.2 
Residual Deviance: 1.524e-08    AIC: 40
# Score the DISCOVERY cohort with the replication-trained model
# (linear predictor, i.e. link scale)
scores_replication_model_on_discovery <- predict(
  model_replication,
  newdata = categorized_combined_data,
  type = "link"
)

# Attach the raw scores to the data frame
categorized_combined_data$scores_replication_model_on_discovery <-
  scores_replication_model_on_discovery

# Mean / SD used to z-standardize the scores for plotting
mean_replication_on_discovery <- mean(categorized_combined_data$scores_replication_model_on_discovery)
sd_replication_on_discovery <- sd(categorized_combined_data$scores_replication_model_on_discovery)

# Notched boxplot of standardized scores by Category
scores_replication_model_on_discovery_plot <- ggplot(
  categorized_combined_data,
  aes(
    x = Category,
    y = (scores_replication_model_on_discovery - mean_replication_on_discovery) /
      sd_replication_on_discovery,
    fill = Category
  )
) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  ylim(-2.5, 2.5) +
  theme_cowplot(12)

scores_replication_model_on_discovery_plot
Warning: Removed 6 rows containing non-finite outside the scale range
(`stat_boxplot()`).

# Wilcoxon rank-sum test: replication-model scores by Category in the
# discovery cohort
wilcox.test_result_replication_on_discovery <- wilcox.test(
  scores_replication_model_on_discovery ~ Category,
  data = categorized_combined_data
)

print(wilcox.test_result_replication_on_discovery)

    Wilcoxon rank sum test with continuity correction

data:  scores_replication_model_on_discovery by Category
W = 118506, p-value = 0.0003579
alternative hypothesis: true location shift is not equal to 0

Replication on replication

# Score the REPLICATION cohort with its own model (in-sample linear predictor)
scores_replication_model_on_replication <- predict(
  model_replication,
  newdata = categorized_replication_data,
  type = "link"
)

# Attach the raw scores to the data frame
categorized_replication_data$scores_replication_model_on_replication <-
  scores_replication_model_on_replication

# Mean / SD used to z-standardize the scores for plotting
mean_replication_on_replication <- mean(categorized_replication_data$scores_replication_model_on_replication)
sd_replication_on_replication <- sd(categorized_replication_data$scores_replication_model_on_replication)

# Notched boxplot of standardized scores by Category
scores_replication_model_on_replication_plot <- ggplot(
  categorized_replication_data,
  aes(
    x = Category,
    y = (scores_replication_model_on_replication - mean_replication_on_replication) /
      sd_replication_on_replication,
    fill = Category
  )
) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  ylim(-2.5, 2.5) +
  theme_cowplot(12)

scores_replication_model_on_replication_plot
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_boxplot()`).

# Wilcoxon rank-sum test (NB: this comment previously said "t-test"):
# replication-model scores by Category within the replication cohort
wilcox.test_replication_on_replication <- wilcox.test(scores_replication_model_on_replication ~ Category, data = categorized_replication_data)

# View the results
print(wilcox.test_replication_on_replication)

    Wilcoxon rank sum test with continuity correction

data:  scores_replication_model_on_replication by Category
W = 0, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0

NOW DO AUC FOR ORIGINAL DATA USING REPLICATION COEFFICIENTS

# Convert the link-scale scores to probabilities via the logistic function
categorized_combined_data <- categorized_combined_data %>%
  mutate(probability_replication_on_discovery = plogis(scores_replication_model_on_discovery))

# Binary outcome: 0 = Control, 1 = any case category
categorized_combined_data$binary_outcome <-
  ifelse(categorized_combined_data$Category == "Control", 0, 1)

# ROC curve for the replication model applied to the discovery cohort
roc_result_full <- roc(
  categorized_combined_data$binary_outcome,
  categorized_combined_data$probability_replication_on_discovery,
  plot = TRUE
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases

# AUC of the replication model evaluated on the discovery cohort
auc(roc_result_full)
Area under the curve: 0.5554

Replication model on discovery data Odds Ratio

# Rank-based quintiles of the replication-model score in the discovery cohort
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    replication_model_on_discovery_rank = rank(scores_replication_model_on_discovery, ties.method = "first"),
    replication_model_on_discovery_score_quintiles = ceiling(replication_model_on_discovery_rank / (n() / 5))
  )

# Odds ratio per quintile (relative to quintile 1), via the helper defined
# earlier in the file
replication_model_on_discovery_combined_odds_summary <-
  calculate_odds_ratios(categorized_combined_data, Category, replication_model_on_discovery_score_quintiles)

# Base-graphics plot of log2(OR) by quintile, with 95% CI whiskers
n_quintiles <- length(replication_model_on_discovery_combined_odds_summary$odds_ratio)
log2_or <- log2(replication_model_on_discovery_combined_odds_summary$odds_ratio)

plot(log2_or,
     ylim = c(-2, 2),
     pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
     xaxt = "n", col = "#3C8C78")
lines(seq_len(n_quintiles), log2_or, col = "#3C8C78")

# Error bars for the 95% confidence intervals
arrows(x0 = seq_len(n_quintiles),
       y0 = log2(replication_model_on_discovery_combined_odds_summary$lower_ci),
       y1 = log2(replication_model_on_discovery_combined_odds_summary$upper_ci),
       code = 3, angle = 90, length = 0.05, col = "#3C8C78")

ANCESTRY

# Controls only: regress the model score on the first 20 genetic PCs so the
# ancestry-correlated component can be removed from everyone's score below
filtered_categorized_combined_data <- subset(categorized_combined_data, Category == "Control")

ancestry_regression_model <- lm(
  scores ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
    PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20,
  data = filtered_categorized_combined_data
)

# Coefficients (intercept + one slope per PC) used for the adjustment
ancestry_coefficients <- coef(ancestry_regression_model)

# Calculate the adjusted_scores by removing the effects of C1 to C10
# Ancestry-adjust the scores: subtract the control-derived PC regression
# prediction (intercept + sum over PC1..PC20 of PC_k * beta_k).
# Rewritten as a single matrix product instead of 20 hand-repeated lines;
# the arithmetic is identical. as.vector() strips dimnames so the resulting
# column is a plain unnamed numeric vector, as before.
pc_names <- paste0("PC", 1:20)
categorized_combined_data$adjusted_scores <-
  categorized_combined_data$scores -
  (ancestry_coefficients["(Intercept)"] +
     as.vector(as.matrix(categorized_combined_data[, pc_names]) %*%
                 ancestry_coefficients[pc_names]))

# Adjusted probabilities on the logistic scale
categorized_combined_data <- categorized_combined_data %>%
  mutate(adjusted_probability = plogis(adjusted_scores))
# Pull out the principal-component columns (names matching "PC<digits>")
pca_columns <- categorized_combined_data[, grep("^PC\\d+$", names(categorized_combined_data))]

# Gaussian mixture model with 5 components on the PC coordinates
gmm_result <- Mclust(pca_columns, G = 5)

# Hard cluster assignment per individual, stored as a factor
categorized_combined_data$cluster_gmm <- as.factor(gmm_result$classification)

# Sanity-check scatter of the clusters in PC1/PC2 space
ggplot(categorized_combined_data, aes(x = PC1, y = PC2, color = cluster_gmm)) +
  geom_point() +
  labs(title = "GMM Clustering of PCA Data") +
  theme_minimal() +
  scale_color_manual(values = rainbow(length(unique(gmm_result$classification)))) +
  theme(legend.position = "bottom")

# Cross-tabulate GMM cluster against recorded ancestry
contingency_table <- table(categorized_combined_data$cluster_gmm, categorized_combined_data$Ancestry)

# Long-format version for plotting
contingency_df <- as.data.frame(contingency_table)
colnames(contingency_df) <- c("Cluster", "Ancestry", "Count")

# Side-by-side bars: ancestry composition of each cluster
ggplot(contingency_df, aes(x = Cluster, y = Count, fill = Ancestry)) +
  geom_col(position = "dodge") +
  labs(title = "Makeup of Each Cluster by Ancestry", x = "Cluster", y = "Count") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Map GMM cluster IDs to ancestry labels (clusters 3 and 5 both collapse
# to "OTH")
cluster_to_ancestry <- c("1" = "EUR", "2" = "HIS", "4" = "AFR", "3" = "OTH", "5" = "OTH")

# Vectorized lookup replacing the original element-wise sapply() and a
# redundant intermediate assignment that was immediately overwritten.
# Factor levels follow the mapping's order of first appearance:
# EUR, HIS, AFR, OTH.
categorized_combined_data$cluster_gmm_Ancestry <- factor(
  unname(cluster_to_ancestry[as.character(categorized_combined_data$cluster_gmm)]),
  levels = unique(cluster_to_ancestry)
)

# Counts of each ancestry cluster within each Category (used for bar labels)
summary_data <- categorized_combined_data %>%
  group_by(Category, cluster_gmm_Ancestry) %>%
  summarise(count = n()) %>%
  ungroup()
`summarise()` has grouped output by 'Category'. You can override using the
`.groups` argument.
# Wes Anderson palette: one color per ancestry cluster label
palette_colors <- wes_palette("FantasticFox1", length(unique(categorized_combined_data$cluster_gmm_Ancestry)), type = "continuous")


# Stacked-percentage bars: ancestry-cluster composition within each Category,
# annotated with raw counts. Note theme_cowplot(12) is added last, so it
# overrides the earlier legend placement — kept to match the original.
ancestry_makeup <- ggplot(categorized_combined_data, aes(x = Category, fill = cluster_gmm_Ancestry)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Makeup of cluster_gmm_Ancestry within Each Category",
       x = "Category",
       y = "Percentage") +
  theme(legend.position = "bottom") +
  scale_fill_manual(values = palette_colors) +
  geom_text(data = summary_data,
            aes(label = count, y = count / sum(count), group = cluster_gmm_Ancestry),
            position = position_fill(vjust = 0.5), color = "black") +
  theme_cowplot(12)


# PC1/PC2 scatter colored by the mapped ancestry labels
clusters <- ggplot(categorized_combined_data, aes(x = PC1, y = PC2, color = cluster_gmm_Ancestry)) +
  geom_point() +
  labs(title = "GMM Clustering of PCA Data") +
  theme_minimal() +
  scale_color_manual(values = palette_colors) +
  theme(legend.position = "bottom") +
  theme_cowplot(12)


# Raw scores by ancestry cluster, split by Case/Control
ancestry_scores <- ggplot(categorized_combined_data,
                          aes(x = cluster_gmm_Ancestry, y = scores, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Scores by cluster_gmm_Ancestry and Category",
       x = "cluster_gmm_Ancestry",
       y = "Scores") +
  scale_fill_manual(values = group_colors) +
  theme(legend.position = "bottom") +
  theme_cowplot(12) +
  ylim(-3, 3)

# Ancestry-adjusted scores by ancestry cluster, split by Case/Control
ancestry_adjusted_scores <- ggplot(categorized_combined_data,
                                   aes(x = cluster_gmm_Ancestry, y = adjusted_scores, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Adjusted Scores by cluster_gmm_Ancestry and Category",
       x = "cluster_gmm_Ancestry",
       y = "Adjusted Scores") +
  scale_fill_manual(values = group_colors) +
  theme(legend.position = "bottom") +
  theme_cowplot(12) +
  ylim(-3, 3)

# Adjusted scores pooled across ancestry, by Category
# (title/x-label text carried over from the plot above — kept as-is)
ancestry_adjusted__total_scores <- ggplot(categorized_combined_data,
                                          aes(x = Category, y = adjusted_scores, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Adjusted Scores by cluster_gmm_Ancestry and Category",
       x = "cluster_gmm_Ancestry",
       y = "Adjusted Scores") +
  scale_fill_manual(values = group_colors) +
  theme(legend.position = "bottom") +
  theme_cowplot(12) +
  ylim(-3, 3)

# Render the cluster scatter and composition plots
clusters

ancestry_makeup

# suppressWarnings: presumably to silence the rows-removed warnings from the
# fixed ylim(-3, 3) on these boxplots — verify on re-render
suppressWarnings(print(ancestry_scores))

suppressWarnings(print(ancestry_adjusted_scores))

suppressWarnings(print(ancestry_adjusted__total_scores))

# Wilcoxon rank-sum test of ancestry-adjusted scores, cases vs. controls
wilcox.test(adjusted_scores ~ Category, data = categorized_combined_data)

    Wilcoxon rank sum test with continuity correction

data:  adjusted_scores by Category
W = 92675, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0

Odds ratios in the highest quintile across ancestry, adjusted

# Quintiles of the ancestry-adjusted score
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    rank_adjusted = rank(adjusted_scores, ties.method = "first"),
    score_adjusted_quintiles = ceiling(rank_adjusted / (n() / 5))
  )

# Per-ancestry subsets (GMM-derived labels)
subset_by_ancestry <- function(label) {
  categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == label, ]
}
AFR <- subset_by_ancestry("AFR")
HIS <- subset_by_ancestry("HIS")
EUR <- subset_by_ancestry("EUR")
OTH <- subset_by_ancestry("OTH")

# Unadjusted-score quintile odds ratios, overall and per ancestry
# (odds-ratio helper defined earlier in the file)
combined_odds_summary <- calculate_odds_ratios(categorized_combined_data, Category, score_quintiles)
combined_odds_summary_AFR <- calculate_odds_ratios(AFR, Category, score_quintiles)
combined_odds_summary_HIS <- calculate_odds_ratios(HIS, Category, score_quintiles)
combined_odds_summary_EUR <- calculate_odds_ratios(EUR, Category, score_quintiles)
combined_odds_summary_OTH <- calculate_odds_ratios(OTH, Category, score_quintiles)

# Print quintile OR table: full cohort, unadjusted score
combined_odds_summary
# A tibble: 5 × 8
  score_quintiles count_category_1 count_category_2_6  odds odds_ratio
            <dbl>            <int>              <int> <dbl>      <dbl>
1               1              160                 48 0.3         1   
2               2              137                 72 0.526       1.75
3               3              112                 96 0.857       2.86
4               4               96                113 1.18        3.92
5               5               32                177 5.53       18.4 
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Print quintile OR table: AFR subset, unadjusted score
combined_odds_summary_AFR
# A tibble: 5 × 8
  score_quintiles count_category_1 count_category_2_6  odds odds_ratio
            <dbl>            <int>              <int> <dbl>      <dbl>
1               1               77                 27 0.351       1   
2               2               51                 22 0.431       1.23
3               3               42                 22 0.524       1.49
4               4               22                 15 0.682       1.94
5               5                5                 35 7          20.0 
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Print quintile OR table: HIS subset, unadjusted score
combined_odds_summary_HIS
# A tibble: 5 × 8
  score_quintiles count_category_1 count_category_2_6  odds odds_ratio
            <dbl>            <int>              <int> <dbl>      <dbl>
1               1               74                 11 0.149       1   
2               2               47                 12 0.255       1.72
3               3               29                 15 0.517       3.48
4               4               37                 11 0.297       2   
5               5               13                  9 0.692       4.66
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Print quintile OR table: EUR subset, unadjusted score
combined_odds_summary_EUR
# A tibble: 5 × 8
  score_quintiles count_category_1 count_category_2_6  odds odds_ratio
            <dbl>            <int>              <int> <dbl>      <dbl>
1               1                8                  5 0.625       1   
2               2               28                 31 1.11        1.77
3               3               33                 46 1.39        2.23
4               4               34                 76 2.24        3.58
5               5               13                119 9.15       14.6 
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Print quintile OR table: OTH subset, unadjusted score
combined_odds_summary_OTH
# A tibble: 5 × 8
  score_quintiles count_category_1 count_category_2_6   odds odds_ratio
            <dbl>            <int>              <int>  <dbl>      <dbl>
1               1                1                  5  5          1    
2               2               11                  7  0.636      0.127
3               3                8                 13  1.62       0.325
4               4                3                 11  3.67       0.733
5               5                1                 14 14          2.8  
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Adjusted-score quintile odds ratios, overall and per ancestry subset
# (same odds-ratio helper as above)
combined_odds_summary_adjust <- calculate_odds_ratios(categorized_combined_data, Category, score_adjusted_quintiles)
combined_odds_summary_AFR_adjust <- calculate_odds_ratios(AFR, Category, score_adjusted_quintiles)
combined_odds_summary_HIS_adjust <- calculate_odds_ratios(HIS, Category, score_adjusted_quintiles)
combined_odds_summary_EUR_adjust <- calculate_odds_ratios(EUR, Category, score_adjusted_quintiles)
combined_odds_summary_OTH_adjust <- calculate_odds_ratios(OTH, Category, score_adjusted_quintiles)

# Print quintile OR table: full cohort, ancestry-adjusted score
combined_odds_summary_adjust
# A tibble: 5 × 8
  score_adjusted_quintiles count_category_1 count_category_2_6  odds odds_ratio
                     <dbl>            <int>              <int> <dbl>      <dbl>
1                        1              134                 74 0.552       1   
2                        2              132                 77 0.583       1.06
3                        3              115                 93 0.809       1.46
4                        4              106                103 0.972       1.76
5                        5               50                159 3.18        5.76
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Print quintile OR table: AFR subset, ancestry-adjusted score
combined_odds_summary_AFR_adjust
# A tibble: 5 × 8
  score_adjusted_quintiles count_category_1 count_category_2_6  odds odds_ratio
                     <dbl>            <int>              <int> <dbl>      <dbl>
1                        1               51                 24 0.471      1    
2                        2               47                 14 0.298      0.633
3                        3               43                 26 0.605      1.28 
4                        4               45                 26 0.578      1.23 
5                        5               11                 31 2.82       5.99 
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Print quintile OR table: HIS subset, ancestry-adjusted score
combined_odds_summary_HIS_adjust
# A tibble: 5 × 8
  score_adjusted_quintiles count_category_1 count_category_2_6  odds odds_ratio
                     <dbl>            <int>              <int> <dbl>      <dbl>
1                        1               54                  8 0.148       1   
2                        2               48                 11 0.229       1.55
3                        3               29                 14 0.483       3.26
4                        4               41                 12 0.293       1.98
5                        5               28                 13 0.464       3.13
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Print quintile OR table: EUR subset, ancestry-adjusted score
combined_odds_summary_EUR_adjust
# A tibble: 5 × 8
  score_adjusted_quintiles count_category_1 count_category_2_6  odds odds_ratio
                     <dbl>            <int>              <int> <dbl>      <dbl>
1                        1               26                 34  1.31       1   
2                        2               31                 45  1.45       1.11
3                        3               34                 46  1.35       1.03
4                        4               15                 54  3.6        2.75
5                        5               10                 98  9.8        7.49
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Print quintile OR table: OTH subset, ancestry-adjusted score
combined_odds_summary_OTH_adjust
# A tibble: 5 × 8
  score_adjusted_quintiles count_category_1 count_category_2_6   odds odds_ratio
                     <dbl>            <int>              <int>  <dbl>      <dbl>
1                        1                3                  8  2.67       1    
2                        2                6                  7  1.17       0.438
3                        3                9                  7  0.778      0.292
4                        4                5                 11  2.2        0.825
5                        5                1                 17 17          6.38 
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Quintile-5 (top 20%) odds ratios with 95% CIs, overall and per ancestry,
# for the UNADJUSTED score. The five-way repeated extraction is collapsed
# into a vapply over a named list of summaries.
unadjusted_summaries <- list(
  Total = combined_odds_summary,
  AFR = combined_odds_summary_AFR,
  HIS = combined_odds_summary_HIS,
  EUR = combined_odds_summary_EUR,
  OTH = combined_odds_summary_OTH
)
quintile_5_data <- data.frame(
  group = names(unadjusted_summaries),
  odds_ratio = unname(vapply(unadjusted_summaries, function(s) s$odds_ratio[5], numeric(1))),
  lower_ci = unname(vapply(unadjusted_summaries, function(s) s$lower_ci[5], numeric(1))),
  upper_ci = unname(vapply(unadjusted_summaries, function(s) s$upper_ci[5], numeric(1)))
)

# Log2-transform the OR and its CI bounds
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)

# Fixed y-axis limits shared with the adjusted plot below
ylim_range <- c(-2, 6)

# Points per group. Fixed: point color was "#3C8474", a typo for the palette
# color "#3C8C78" used everywhere else in this file, including the error bars
# of this very plot.
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19,
     xlab = "Group", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups",
     xaxt = "n", col = "#3C8C78")
axis(1, at = 1:5, labels = quintile_5_data$group)

# Error bars for the 95% confidence intervals
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci,
       code = 3, angle = 90, length = 0.05, col = "#3C8C78")

# Quintile-5 odds ratios with 95% CIs for the ANCESTRY-ADJUSTED score,
# overall and per ancestry subset (overwrites quintile_5_data, as before)
adjusted_summaries <- list(
  Total = combined_odds_summary_adjust,
  AFR = combined_odds_summary_AFR_adjust,
  HIS = combined_odds_summary_HIS_adjust,
  EUR = combined_odds_summary_EUR_adjust,
  OTH = combined_odds_summary_OTH_adjust
)
quintile_5_data <- data.frame(
  group = names(adjusted_summaries),
  odds_ratio = unname(vapply(adjusted_summaries, function(s) s$odds_ratio[5], numeric(1))),
  lower_ci = unname(vapply(adjusted_summaries, function(s) s$lower_ci[5], numeric(1))),
  upper_ci = unname(vapply(adjusted_summaries, function(s) s$upper_ci[5], numeric(1)))
)

# Log2-transform the OR and its CI bounds
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)

# Fixed y-axis limits (same as the unadjusted plot)
ylim_range <- range(-2, 6)

# Points per group, black to distinguish the adjusted version
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19,
     xlab = "Group", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups, adjusted",
     xaxt = "n", col = "black")
axis(1, at = 1:5, labels = quintile_5_data$group)

# Error bars for the 95% confidence intervals
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci,
       code = 3, angle = 90, length = 0.05, col = "black")

# Binary outcome: 0 = Control, 1 = any case category
categorized_combined_data$binary_outcome <-
  ifelse(categorized_combined_data$Category == "Control", 0, 1)
# ROC curve for the ancestry-adjusted probabilities in the discovery cohort
roc_result_full_adjusted <- roc(
  categorized_combined_data$binary_outcome,
  categorized_combined_data$adjusted_probability,
  plot = TRUE
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases

# AUC for the ancestry-adjusted model in the discovery cohort
auc(roc_result_full_adjusted)
Area under the curve: 0.6589
# Ancestry-adjust the REPLICATION-cohort scores using the control-derived PC
# coefficients estimated in the discovery cohort. Same matrix-product form as
# the discovery adjustment, replacing 20 hand-repeated lines; as.vector()
# strips dimnames so the column stays a plain numeric vector.
pc_names <- paste0("PC", 1:20)
categorized_replication_data$adjusted_scores_replication <-
  categorized_replication_data$scores_replication -
  (ancestry_coefficients["(Intercept)"] +
     as.vector(as.matrix(categorized_replication_data[, pc_names]) %*%
                 ancestry_coefficients[pc_names]))

# Adjusted probabilities on the logistic scale
categorized_replication_data <- categorized_replication_data %>%
  mutate(adjusted_probability_replication = plogis(adjusted_scores_replication))


# ROC curve for the ancestry-adjusted probabilities in the replication cohort
roc_result_full_adjusted <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$adjusted_probability_replication)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# AUC for the replication cohort, ancestry-adjusted
auc(roc_result_full_adjusted)
Area under the curve: 0.8287
# Plot the ROC curve with the x-axis reversed (specificity from 1 to 0)
plot(roc_result_full_adjusted,xlim = c(1, 0))